From 66e4f16cb15eab504ce05393e8962b9c4d6ffc2c Mon Sep 17 00:00:00 2001 From: reminisce Date: Wed, 15 Mar 2017 16:13:53 -0700 Subject: [PATCH 01/16] Initial check-in of new pooling op --- src/operator/mxnet_op.h | 16 ++ src/operator/nn/pool.cuh | 144 +++++++++++++++ src/operator/nn/pool.h | 117 ++++++++++++ src/operator/pooling-inl.h | 150 +++++++-------- src/operator/pooling.cc | 26 +-- src/operator/pooling.cu | 15 +- src/operator/pooling_v1-inl.h | 335 ++++++++++++++++++++++++++++++++++ src/operator/pooling_v1.cc | 89 +++++++++ src/operator/pooling_v1.cu | 36 ++++ 9 files changed, 819 insertions(+), 109 deletions(-) create mode 100644 src/operator/nn/pool.cuh create mode 100644 src/operator/nn/pool.h create mode 100644 src/operator/pooling_v1-inl.h create mode 100644 src/operator/pooling_v1.cc create mode 100644 src/operator/pooling_v1.cu diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 6d8b45fc1188..d46b978e0218 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -139,6 +139,22 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { } +struct fill { + template + MSHADOW_XINLINE static void Map(int i, DType* out, const DType val) { + out[i] = val; + } +}; + + +struct set_zero { + template + MSHADOW_XINLINE static void Map(int i, DType* out) { + out[i] = static_cast(0); + } +}; + + template struct Kernel; diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh new file mode 100644 index 000000000000..e8f53a5d4ada --- /dev/null +++ b/src/operator/nn/pool.cuh @@ -0,0 +1,144 @@ +#ifndef MXNET_OPERATOR_NN_POOL_CUH_ +#define MXNET_OPERATOR_NN_POOL_CUH_ + +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +/*! 
+ * \brief Get the number of blocks for cuda kernel given N + */ +inline int cuda_get_num_blocks(const int N) { + using namespace mshadow::cuda; + return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); +} + +template +__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in_data, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + OpReqType req_type, DType* const out_data, int32_t* mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* const in_slice = + in_data + (n * channels + c) * height * width; + int in_index = hstart * width + wstart; + DType max_val = in_slice[in_index]; + int max_idx = in_index; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_index = h * width + w; + const DType in_val = in_slice[in_index]; + if (in_val > max_val) { + max_val = in_val; + max_idx = in_index; + } + } + } + mask[index] = max_idx; + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + +template +__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* const out_data, + const int32_t* const mask, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + OpReqType req_type, DType* 
const in_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + DType gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + const DType* const out_data_slice = out_data + offset; + const int* const mask_slice = mask + offset; + int in_index = h * width + w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int pooled_index = ph * pooled_width + pw; + if (mask_slice[pooled_index] == in_index) { + gradient += out_data_slice[pooled_index]; + } + } + } + KERNEL_ASSIGN(in_data[index], req_type, gradient); + } +} + +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data, int32_t* mask = nullptr) { + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, mask); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } + } +} + +template +inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, 
const int pool_type, OpReqType req_type, DType* in_data, + const int32_t* mask = nullptr) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + } + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_data, mask, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, in_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_POOL_CUH_ diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h new file mode 100644 index 000000000000..15b0f7132844 --- /dev/null +++ b/src/operator/nn/pool.h @@ -0,0 +1,117 @@ +#ifndef MXNET_OPERATOR_NN_POOL_H_ +#define MXNET_OPERATOR_NN_POOL_H_ + +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +namespace pool_enum { +enum PoolingOpInputs {kData}; +enum PoolingOpOutputs {kOut, kMask}; +enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingOpPadConventionType {kValid, kFull}; +} // namespace pool_enum + +template +inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, int32_t* mask) { + const index_t height = ishape[2], width = ishape[3]; + const index_t pooled_height = oshape[2], pooled_width = oshape[3]; + const index_t kernel_h = kernel[0], kernel_w = kernel[1]; + const index_t pad_h = pad[0], pad_w = pad[1]; + const index_t stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) 
{ + for (index_t ph = 0; ph < pooled_height; ++ph) { + for (index_t pw = 0; pw < pooled_width; ++pw) { + index_t tmp_h = ph * stride_h; + index_t tmp_w = pw * stride_w; + index_t hend = std::min(tmp_h + kernel_h - pad_h, height); + index_t wend = std::min(tmp_w + kernel_w - pad_w, width); + index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); + index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); + const index_t pool_index = ph * pooled_width + pw; + index_t in_index = hstart * width + wstart; + DType max_val = in_data[in_index]; + mask[pool_index] = in_index; + for (index_t h = hstart; h < hend; ++h) { + for (index_t w = wstart; w < wend; ++w) { + in_index = h * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + mask[pool_index] = in_index; + } + } + } + KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + mask += out_data_offset; + } + } +} + +template +inline void unpool_max_2d_cpu(mshadow::Stream* s, const DType* out_data, const int32_t* mask, + const TShape& ishape, const TShape& oshape, DType* in_data) { + const index_t pooled_height = oshape[2], pooled_width = oshape[3]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (index_t ph = 0; ph < pooled_height; ++ph) { + for (index_t pw = 0; pw < pooled_width; ++pw) { + const index_t out_index = ph * pooled_width + pw; + in_data[mask[out_index]] += out_data[out_index]; + } + } + in_data += in_data_offset; + out_data += out_data_offset; + mask += out_data_offset; + } + } +} + +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data, int32_t* mask = nullptr) { + if (kernel.ndim() 
== 2) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, mask); + } + } +} + +template +inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, + const int32_t* mask = nullptr) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + } + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_2d_cpu(s, out_data, mask, ishape, oshape, in_data); + } + } +} + +} // namespace op +} // namespace mxnet +#ifdef __CUDACC__ +#include "./pool.cuh" +#endif + +#endif // MXNET_OPERATOR_NN_POOL_H_ diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 9ea0155b21be..5258844341c4 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #ifndef MXNET_OPERATOR_POOLING_INL_H_ @@ -17,16 +17,19 @@ #include #include #include "./operator_common.h" +#include "./nn/pool.h" namespace mxnet { namespace op { +#if 0 namespace pool_enum { enum PoolingOpInputs {kData}; enum PoolingOpOutputs {kOut}; enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +#endif struct PoolingParam : public dmlc::Parameter { TShape kernel; @@ -63,107 +66,60 @@ struct PoolingParam : public dmlc::Parameter { } }; -template +template class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(out, - req[pool_enum::kOut], - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 
1 : param_.stride[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(out, - req[pool_enum::kOut], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - } + pool(s, in_data[pool_enum::kData].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kOut], + out_data[pool_enum::kOut].dptr(), + pool_enum::kMaxPooling == param_.pool_type? out_data[pool_enum::kMask].dptr() + : nullptr); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); CHECK_EQ(req.size(), 1U); CHECK_EQ(in_grad.size(), 1U); - // TODO(bing): remove pad (0,0) - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } Stream *s = ctx.get_stream(); - Tensor grad = out_grad[pool_enum::kOut].get(s); - Tensor data = in_data[pool_enum::kData].get(s); - Tensor output_data = out_data[pool_enum::kOut].get(s); - Tensor input_grad = in_grad[pool_enum::kData].get(s); - - 
mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(input_grad, req[pool_enum::kData], - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(input_grad, req[pool_enum::kData], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } + unpool(s, out_data[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kData], + in_data[pool_enum::kData].dptr(), + pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() + : nullptr); } private: @@ -234,7 +190,10 @@ class PoolingProp : public OperatorProperty { } } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape + if (pool_enum::kMaxPooling == param_.pool_type) { + out_shape->push_back(oshape); // save mask shape + } } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -267,7 +226,10 @@ class PoolingProp : public OperatorProperty { } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape + if (pool_enum::kMaxPooling == param_.pool_type) { + out_shape->push_back(oshape); // save mask shape + } } return true; } @@ -285,6 +247,9 @@ class PoolingProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); + if (pool_enum::kMaxPooling == param_.pool_type) { + out_type->push_back(mshadow::kInt32); + } return true; } @@ -302,6 +267,9 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { + if (pool_enum::kMaxPooling == param_.pool_type) { + return {out_grad[pool_enum::kOut], out_data[pool_enum::kMask]}; + } return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; } @@ -317,6 +285,16 @@ class PoolingProp : public OperatorProperty { #endif } + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + // For max pooling, need to return mask as well + if (pool_enum::kMaxPooling == param_.pool_type) return 2; + return 1; + } + Operator* CreateOperator(Context ctx) const override { LOG(FATAL) << "Not Implemented."; return NULL; diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index a4eed2633232..e4722acaf3e8 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include "./pooling-inl.h" #if MXNET_USE_MKL2017 == 1 @@ -51,21 +51,15 @@ Operator *CreateOp(PoolingParam param, int dtype) { } #endif MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; + return NULL; } - }) + }); return op; } diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index be2464e3c0ef..d8d5c535fce9 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include #include "./pooling-inl.h" @@ -12,6 +12,7 @@ namespace mxnet { namespace op { + template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; @@ -25,8 +26,8 @@ Operator *CreateOp(PoolingParam param, int dtype) { op = new CuDNNPoolingOp(param); break; case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied."; - op = new PoolingOp(param); + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + op = new PoolingOp(param); break; default: LOG(FATAL) << "unknown pooling type"; @@ -37,13 +38,13 @@ Operator *CreateOp(PoolingParam param, int dtype) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { switch (param.pool_type) { case pool_enum::kMaxPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; case pool_enum::kAvgPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; case pool_enum::kSumPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; default: LOG(FATAL) << "unknown pooling type"; diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h new file mode 100644 index 000000000000..d354fd7322b8 --- /dev/null +++ b/src/operator/pooling_v1-inl.h @@ -0,0 +1,335 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file pooling_v1-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_POOLING_V1_INL_H_ +#define MXNET_OPERATOR_POOLING_V1_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace pool_v1_enum { +enum PoolingV1OpInputs {kData}; +enum PoolingV1OpOutputs {kOut}; +enum PoolingV1OpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingV1OpPadConventionType {kValid, kFull}; +} // namespace pool_v1_enum + +struct PoolingV1Param : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + int pool_type; + int pooling_convention; + bool global_pool; + DMLC_DECLARE_PARAMETER(PoolingV1Param) { + DMLC_DECLARE_FIELD(global_pool).set_default(false) + .describe("Ignore kernel size, do global pooling based on current input feature map. "); + + DMLC_DECLARE_FIELD(kernel) + .enforce_nonzero() + .describe("pooling kernel size: (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pool_type) + .add_enum("max", pool_v1_enum::kMaxPooling) + .add_enum("avg", pool_v1_enum::kAvgPooling) + .add_enum("sum", pool_v1_enum::kSumPooling) + .describe("Pooling type to be applied."); + + DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid) + .add_enum("full", pool_v1_enum::kFull) + .add_enum("valid", pool_v1_enum::kValid) + .describe("Pooling convention to be applied."); + + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .enforce_nonzero() + .describe("stride: for pooling (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for pooling: (y, x) or (d, y, x)"); + } +}; + +template +class PoolingV1Op : public Operator { + public: + explicit PoolingV1Op(PoolingV1Param p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + 
using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor out = out_data[pool_v1_enum::kOut].get(s); + mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); + if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(out, + req[pool_v1_enum::kOut], + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(out, + req[pool_v1_enum::kOut], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1])); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(in_grad.size(), 1); + // TODO(bing): remove pad (0,0) + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[pool_v1_enum::kOut].get(s); + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor output_data = out_data[pool_v1_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_v1_enum::kData].get(s); + + mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); + + if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } + } + + private: + PoolingV1Param param_; +}; // class PoolingV1Op + +template +Operator* CreateOp(PoolingV1Param param, int dtype); + + +#if DMLC_USE_CXX11 +class PoolingV1Prop : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) " + << "Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " 
padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); + } else if (param_.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; + } else { + if (param_.pool_type == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + oshape[4] = 1 + static_cast(ceil(static_cast( + dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / param_.stride[2])); + } + } + + out_shape->clear(); + out_shape->push_back(oshape); + } + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector 
*aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + + if (dtype == -1) { + LOG(FATAL) << "Input type to pooling is not specified."; + return false; + } + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PoolingV1Prop *prop_sym = new PoolingV1Prop(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + std::string TypeString() const override { + return "PoolingV1"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[pool_v1_enum::kOut], in_data[pool_v1_enum::kData], + out_data[pool_v1_enum::kOut]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { +#if MXNET_USE_CUDNN == 1 + return {}; +#else + return {{in_data[pool_v1_enum::kData], in_grad[pool_v1_enum::kData]}}; +#endif + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + PoolingV1Param param_; +}; // class PoolingV1Prop +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_POOLING_V1_INL_H_ diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc new file mode 100644 index 000000000000..cdf4f1af5939 --- /dev/null +++ b/src/operator/pooling_v1.cc @@ -0,0 +1,89 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file pooling_v1.cc + * \brief + * \author Bing Xu +*/ +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }) + + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator* PoolingV1Prop::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(PoolingV1Param); + +MXNET_REGISTER_OP_PROPERTY(Pooling_v1, PoolingV1Prop) +.describe(R"code(Perform pooling on the input. + +The shapes for 2-D pooling is + +- **data**: *(batch_size, channel, height, width)* +- **out**: *(batch_size, num_filter, out_height, out_width)*, with:: + + out_height = f(height, kernel[0], pad[0], stride[0]) + out_width = f(width, kernel[1], pad[1], stride[1]) + +The defintion of *f* depends on ``pooling_convention``, which has two options: + +- **valid** (default):: + + f(x, k, p, s) = floor(x+2*p-k)/s+1 + +- **full**, which is compatible with Caffe:: + + f(x, k, p, s) = ceil(x+2*p-k)/s+1 + +But ``global_pool`` is set to be true, then do a global pooling, namely reset +``kernel=(height, width)``. 
+ +Three pooling options are supported by ``pool_type``: + +- **avg**: average pooling +- **max**: max pooling +- **sum**: sum pooling + +1-D pooling is a special case of 2-D pooling with *width=1* and +*kernel[1]=1*. + +For 3-D pooling, an additional *depth* dimension is added before +*height*. Namely the input data will have shape *(batch_size, channel, depth, +height, width)*. + +)code" ADD_FILELINE) +.add_argument("data", "ndarray-or-symbol", "Input data to the pooling operator.") +.add_arguments(PoolingV1Param::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu new file mode 100644 index 000000000000..99aebbc6446c --- /dev/null +++ b/src/operator/pooling_v1.cu @@ -0,0 +1,36 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling_v1.cu + * \brief + * \author Bing Xu +*/ +#include +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); + return op; +} + +} // namespace op +} // namespace mxnet + From 966780c2b576e2379d8528151925701bc3d284cf Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 15:22:10 -0700 Subject: [PATCH 02/16] Implemented 2d pooling cpu and gpu --- src/operator/nn/pool.cuh | 92 +++++++++++++++++++---------------- src/operator/nn/pool.h | 65 +++++++++++++++++-------- src/operator/pooling-inl.h | 66 +++++++++---------------- src/operator/pooling.cu | 51 ++++++++----------- src/operator/pooling_v1-inl.h | 2 +- 5 files changed, 138 insertions(+), 138 deletions(-) diff --git 
a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index e8f53a5d4ada..bc0f56c8a96f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -4,6 +4,7 @@ #include #include #include "../mxnet_op.h" +#include "../../common/cuda_utils.h" namespace mxnet { namespace op { @@ -27,7 +28,8 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const out_data, int32_t* mask) { + OpReqType req_type, DType* const out_data) { + // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; @@ -41,60 +43,63 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in wstart = max(wstart, 0); const DType* const in_slice = in_data + (n * channels + c) * height * width; - int in_index = hstart * width + wstart; - DType max_val = in_slice[in_index]; - int max_idx = in_index; + DType max_val = in_slice[hstart * width + wstart]; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - in_index = h * width + w; + const int in_index = h * width + w; const DType in_val = in_slice[in_index]; if (in_val > max_val) { max_val = in_val; - max_idx = in_index; } } } - mask[index] = max_idx; KERNEL_ASSIGN(out_data[index], req_type, max_val); } } template -__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* const out_data, - const int32_t* const mask, const int channels, - const int height, const int width, +__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, 
const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const in_data) { + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - DType gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const DType* const out_data_slice = out_data + offset; - const int* const mask_slice = mask + offset; - int in_index = h * width + w; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - int pooled_index = ph * pooled_width + pw; - if (mask_slice[pooled_index] == in_index) { - gradient += out_data_slice[pooled_index]; + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * height * width; + const 
DType* in_data_slice = in_data + in_offset; + int max_idx = hstart * width + wstart; + DType max_val = out_data[index]; + bool found = false; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + max_idx = h * width + w; + if (in_data_slice[max_idx] == max_val) { + found = true; + break; } } + if (found) break; } - KERNEL_ASSIGN(in_data[index], req_type, gradient); + + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); } } @@ -102,7 +107,7 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, int32_t* mask = nullptr) { + DType* out_data) { if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -110,29 +115,30 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, mask); + stride[0], stride[1], pad[0], pad[1], req_type, out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); } } } template -inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, - const TShape& oshape, const TShape& kernel, const TShape& pad, - const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, - const int32_t* mask = nullptr) { +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { - mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + 
mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( - ishape.Size(), out_data, mask, ishape[1], ishape[2], ishape[3], + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, in_data); + stride[0], stride[1], pad[0], pad[1], in_grad); MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 15b0f7132844..19d2877d7062 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -18,7 +18,7 @@ enum PoolingOpPadConventionType {kValid, kFull}; template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, int32_t* mask) { + OpReqType req_type, DType* out_data) { const index_t height = ishape[2], width = ishape[3]; const index_t pooled_height = oshape[2], pooled_width = oshape[3]; const index_t kernel_h = kernel[0], kernel_w = kernel[1]; @@ -39,13 +39,11 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS const index_t pool_index = ph * pooled_width + pw; index_t in_index = hstart * width + wstart; DType max_val = in_data[in_index]; - mask[pool_index] = in_index; for (index_t h = hstart; h < hend; ++h) { for (index_t w = wstart; w < wend; ++w) { in_index = h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; - mask[pool_index] = in_index; } } } @@ -54,28 +52,53 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } in_data += in_data_offset; out_data += out_data_offset; - mask += out_data_offset; } } } template -inline void unpool_max_2d_cpu(mshadow::Stream* s, const DType* out_data, const 
int32_t* mask, - const TShape& ishape, const TShape& oshape, DType* in_data) { +inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const index_t height = ishape[2], width = ishape[3]; const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t in_data_offset = ishape[2] * ishape[3]; - const index_t out_data_offset = oshape[2] * oshape[3]; + const index_t kernel_h = kernel[0], kernel_w = kernel[1]; + const index_t pad_h = pad[0], pad_w = pad[1]; + const index_t stride_h = stride[0], stride_w = stride[1]; + const index_t in_offset = ishape[2] * ishape[3]; + const index_t out_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { for (index_t ph = 0; ph < pooled_height; ++ph) { for (index_t pw = 0; pw < pooled_width; ++pw) { - const index_t out_index = ph * pooled_width + pw; - in_data[mask[out_index]] += out_data[out_index]; + index_t tmp_h = ph * stride_h; + index_t tmp_w = pw * stride_w; + index_t hend = std::min(tmp_h + kernel_h - pad_h, height); + index_t wend = std::min(tmp_w + kernel_w - pad_w, width); + index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); + index_t wstart = (tmp_w > pad_w? 
tmp_w - pad_w : 0); + const index_t pool_index = ph * pooled_width + pw; + index_t max_idx = hstart * width + wstart; + bool found = false; + for (index_t h = hstart; h < hend; ++h) { + for (index_t w = wstart; w < wend; ++w) { + max_idx = h * width + w; + if (in_data[max_idx] == out_data[pool_index]) { + found = true; + break; + } + } + if (found) break; + } + in_grad[max_idx] += out_grad[pool_index]; } } - in_data += in_data_offset; - out_data += out_data_offset; - mask += out_data_offset; + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; } } } @@ -84,26 +107,26 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, int32_t* mask = nullptr) { + DType* out_data) { if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, mask); + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } } } template -inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, - const TShape& oshape, const TShape& kernel, const TShape& pad, - const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, - const int32_t* mask = nullptr) { +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { - mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - unpool_max_2d_cpu(s, out_data, 
mask, ishape, oshape, in_data); + unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } } } diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 5258844341c4..b1dcad8d3d7e 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -22,15 +22,6 @@ namespace mxnet { namespace op { -#if 0 -namespace pool_enum { -enum PoolingOpInputs {kData}; -enum PoolingOpOutputs {kOut}; -enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; -enum PoolingOpPadConventionType {kValid, kFull}; -} // namespace pool_enum -#endif - struct PoolingParam : public dmlc::Parameter { TShape kernel; TShape stride; @@ -38,10 +29,14 @@ struct PoolingParam : public dmlc::Parameter { int pool_type; int pooling_convention; bool global_pool; + bool cudnn_off; DMLC_DECLARE_PARAMETER(PoolingParam) { DMLC_DECLARE_FIELD(global_pool).set_default(false) .describe("Ignore kernel size, do global pooling based on current input feature map. "); + DMLC_DECLARE_FIELD(cudnn_off).set_default(false) + .describe("Turn off cudnn pooling and use MXNet pooling operator. "); + DMLC_DECLARE_FIELD(kernel) .enforce_nonzero() .describe("pooling kernel size: (y, x) or (d, y, x)"); @@ -82,17 +77,19 @@ class PoolingOp : public Operator { CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[pool_enum::kData].shape_; + pool(s, in_data[pool_enum::kData].dptr(), in_data[pool_enum::kData].shape_, out_data[pool_enum::kOut].shape_, - param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, param_.pad, param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, param_.pool_type, req[pool_enum::kOut], - out_data[pool_enum::kOut].dptr(), - pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() - : nullptr); + out_data[pool_enum::kOut].dptr()); } virtual void Backward(const OpContext& ctx, @@ -109,17 +106,21 @@ class PoolingOp : public Operator { CHECK_EQ(req.size(), 1U); CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); - unpool(s, out_data[pool_enum::kOut].dptr(), - in_data[pool_enum::kData].shape_, - out_data[pool_enum::kOut].shape_, - param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + const TShape& ishape = in_data[pool_enum::kData].shape_; + + unpool(s, out_grad[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].dptr(), + out_data[pool_enum::kOut].dptr(), + in_grad[pool_enum::kData].shape_, + out_grad[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, param_.pad, param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, param_.pool_type, req[pool_enum::kData], - in_data[pool_enum::kData].dptr(), - pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() - : nullptr); + in_grad[pool_enum::kData].dptr()); } private: @@ -191,9 +192,6 @@ class PoolingProp : public OperatorProperty { } out_shape->clear(); out_shape->push_back(oshape); // save output shape - if (pool_enum::kMaxPooling == param_.pool_type) { - out_shape->push_back(oshape); // save mask shape - } } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -227,9 +225,6 @@ class PoolingProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); // save output shape - if (pool_enum::kMaxPooling == param_.pool_type) { - out_shape->push_back(oshape); // save mask shape - } } return true; } @@ -247,9 +242,6 @@ class PoolingProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); - if (pool_enum::kMaxPooling == param_.pool_type) { - out_type->push_back(mshadow::kInt32); - } return true; } @@ -267,10 +259,8 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - if (pool_enum::kMaxPooling == param_.pool_type) { - return {out_grad[pool_enum::kOut], out_data[pool_enum::kMask]}; - } - return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], + out_data[pool_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -285,16 +275,6 @@ class PoolingProp : public OperatorProperty { #endif } - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - // For max pooling, need to return mask as well - if (pool_enum::kMaxPooling == param_.pool_type) return 2; - return 1; - } - Operator* CreateOperator(Context ctx) const override { LOG(FATAL) << "Not Implemented."; return NULL; diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index d8d5c535fce9..c420852b1c8d 100644 --- 
a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -15,43 +15,34 @@ namespace op { template<> Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (!param.cudnn_off) { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kAvgPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + return new PoolingOp(param); + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } } }); -#else +#endif // MXNET_USE_CUDNN + Operator *op = NULL; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; } }); -#endif // MXNET_USE_CUDNN return op; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index d354fd7322b8..ff71d2e7b8a5 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -294,7 +294,7 @@ class PoolingV1Prop : public OperatorProperty { } 
std::string TypeString() const override { - return "PoolingV1"; + return "Pooling_v1"; } std::vector DeclareBackwardDependency( From 8776bec177358a3a032aee2b3da3a19ee9f1336c Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 21:55:31 -0700 Subject: [PATCH 03/16] Add 2d avg and sum pooling cpu and gpu --- src/operator/nn/pool.cuh | 120 +++++++++++++++++++++++++++++++++++++-- src/operator/nn/pool.h | 94 ++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index bc0f56c8a96f..3816417ec311 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -23,12 +23,12 @@ inline int cuda_get_num_blocks(const int N) { } template -__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in_data, +__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const out_data) { + OpReqType req_type, DType* out_data) { // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -41,7 +41,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in const int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - const DType* const in_slice = + const DType* in_slice = in_data + (n * channels + c) * height * width; DType max_val = in_slice[hstart * width + wstart]; for (int h = hstart; h < hend; ++h) { @@ -57,6 +57,40 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in } } +template +__global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int height, const int width, + const int 
pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, OpReqType req_type, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[h * width + w]; + } + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -103,6 +137,46 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int c = (index / 
width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[ph * pooled_width + pw] / pool_size; + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -117,6 +191,24 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape[2], oshape[3], kernel[0], kernel[1], stride[0], stride[1], pad[0], pad[1], req_type, out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + 
pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } @@ -133,13 +225,33 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], stride[0], stride[1], pad[0], pad[1], in_grad); MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 19d2877d7062..3de8e99d0d86 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -56,6 +56,45 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const 
TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[h*width+w]; + } + } + KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? 
sum/pool_size : sum); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -103,6 +142,49 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, } } +template +inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_grad_offset = ishape[2] * ishape[3]; + const index_t out_grad_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (hend - hstart) * (wend - wstart); + } + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = ph * pooled_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[h*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -111,6 +193,12 @@ inline void 
pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } @@ -127,6 +215,12 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } From 4678368847ea45fd995d8dd21c0b630a6c85fc7f Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 22:08:21 -0700 Subject: [PATCH 04/16] Fix lint --- src/operator/nn/pool.cuh | 10 +++++----- src/operator/nn/pool.h | 17 ++++++++--------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 3816417ec311..5e62a67e344f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -75,8 +75,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, int hend = min(hstart + kernel_h, height + pad_h); int wend = min(wstart + kernel_w, width + pad_w); const int pool_size = (getAvg? 
(hend - hstart) * (wend - wstart) : 1); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); hend = min(hend, height); wend = min(wend, width); DType sum = 0; @@ -84,8 +84,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, in_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += out_slice[h * width + w]; - } + sum += out_slice[h * width + w]; + } } KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); } @@ -176,7 +176,7 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr in_grad[index] += gradient; } } - + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 3de8e99d0d86..d6a42de1330f 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -76,16 +76,16 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS int hend = std::min(hstart + kernel_h, height + pad_h); int wend = std::min(wstart + kernel_w, width + pad_w); int pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); DType sum = 0; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += in_data[h*width+w]; + sum += in_data[h*width+w]; } - } + } KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? 
sum/pool_size : sum); } } @@ -166,16 +166,16 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, if (isAvg) { pool_size = (hend - hstart) * (wend - wstart); } - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); const int pool_index = ph * pooled_width + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - in_grad[h*width+w] += out_grad[pool_index] / pool_size; + in_grad[h*width+w] += out_grad[pool_index] / pool_size; } - } + } } } in_grad += in_grad_offset; @@ -184,7 +184,6 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } - template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, From 01749a7eb0f5ce4237b019d0fa0d191427980c2d Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 13:59:34 -0700 Subject: [PATCH 05/16] Add pooling 3d cpu and bug fix --- src/operator/nn/pool.cuh | 11 +- src/operator/nn/pool.h | 212 ++++++++++++++++++++++++++++++++------- 2 files changed, 181 insertions(+), 42 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 5e62a67e344f..5160dec8c879 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -29,6 +29,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -43,7 +44,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, wstart = max(wstart, 0); const DType* in_slice = in_data + (n * channels 
+ c) * height * width; - DType max_val = in_slice[hstart * width + wstart]; + DType max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int in_index = h * width + w; @@ -119,7 +120,7 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr // in data/grad offset batch and channel dims int in_offset = (n * channels + c) * height * width; const DType* in_data_slice = in_data + in_offset; - int max_idx = hstart * width + wstart; + int max_idx = -1; DType max_val = out_data[index]; bool found = false; for (int h = hstart; h < hend; ++h) { @@ -133,7 +134,11 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr if (found) break; } - atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index d6a42de1330f..538041621198 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -19,29 +19,29 @@ template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, OpReqType req_type, DType* out_data) { - const index_t height = ishape[2], width = ishape[3]; - const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t kernel_h = kernel[0], kernel_w = kernel[1]; - const index_t pad_h = pad[0], pad_w = pad[1]; - const index_t stride_h = stride[0], stride_w = stride[1]; + using mshadow::red::limits::MinValue; + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; const index_t 
in_data_offset = ishape[2] * ishape[3]; const index_t out_data_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { - for (index_t ph = 0; ph < pooled_height; ++ph) { - for (index_t pw = 0; pw < pooled_width; ++pw) { - index_t tmp_h = ph * stride_h; - index_t tmp_w = pw * stride_w; - index_t hend = std::min(tmp_h + kernel_h - pad_h, height); - index_t wend = std::min(tmp_w + kernel_w - pad_w, width); - index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); - index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); - const index_t pool_index = ph * pooled_width + pw; - index_t in_index = hstart * width + wstart; - DType max_val = in_data[in_index]; - for (index_t h = hstart; h < hend; ++h) { - for (index_t w = wstart; w < wend; ++w) { - in_index = h * width + w; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + DType max_val = MinValue(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index= h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } @@ -56,6 +56,54 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], 
kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index= (d * height + h) * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + } + } + } + } + KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -101,37 +149,102 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad) { - const index_t height = ishape[2], width = ishape[3]; - const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t kernel_h = kernel[0], kernel_w = kernel[1]; - const index_t pad_h = pad[0], pad_w = pad[1]; - const index_t 
stride_h = stride[0], stride_w = stride[1]; + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; const index_t in_offset = ishape[2] * ishape[3]; const index_t out_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { - for (index_t ph = 0; ph < pooled_height; ++ph) { - for (index_t pw = 0; pw < pooled_width; ++pw) { - index_t tmp_h = ph * stride_h; - index_t tmp_w = pw * stride_w; - index_t hend = std::min(tmp_h + kernel_h - pad_h, height); - index_t wend = std::min(tmp_w + kernel_w - pad_w, width); - index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); - index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); - const index_t pool_index = ph * pooled_width + pw; - index_t max_idx = hstart * width + wstart; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + int max_idx = -1; bool found = false; - for (index_t h = hstart; h < hend; ++h) { - for (index_t w = wstart; w < wend; ++w) { - max_idx = h * width + w; - if (in_data[max_idx] == out_data[pool_index]) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = h * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; found = true; break; } } if (found) break; } - in_grad[max_idx] += out_grad[pool_index]; + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. 
+ if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + +template +inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + int max_idx = -1; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = (d * height + h) * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + // In the case where pad > 0 and kernel = 1, for 
example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } } } in_data += in_offset; @@ -199,7 +312,18 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } + } template @@ -221,6 +345,16 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } } From 3578b2f44c65d157a9fa13a6c1f0f8eb5f6fcdeb Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 15:06:48 -0700 Subject: [PATCH 06/16] Added 3d max pooling gpu --- src/operator/mxnet_op.h | 17 ++++ src/operator/nn/im2col.cuh | 18 +--- src/operator/nn/pool.cuh | 190 ++++++++++++++++++++++++++++++++++--- 3 files changed, 198 insertions(+), 27 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index d46b978e0218..9b5dcfe3d3b1 100644 --- 
a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -23,6 +23,23 @@ using std::isnan; #endif +#ifdef __CUDACC__ +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + + +/*! + * \brief Get the number of blocks for cuda kernel given N + */ +inline int cuda_get_num_blocks(const int N) { + using namespace mshadow::cuda; + return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); +} +#endif // __CUDACC__ + + /*! \brief operator request type switch */ #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \ switch (req) { \ diff --git a/src/operator/nn/im2col.cuh b/src/operator/nn/im2col.cuh index 8555319f5f68..786fd22f8c9b 100644 --- a/src/operator/nn/im2col.cuh +++ b/src/operator/nn/im2col.cuh @@ -71,20 +71,6 @@ namespace mxnet { namespace op { -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -/*! - * \brief Get the number of blocks for cuda kernel given N - */ -inline int cuda_get_num_blocks(const int N) { - using namespace mshadow::cuda; - return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); -} - /*! * \brief im2col gpu kernel. * DO NOT call this directly. 
Use wrapper function im2col() instead; @@ -141,6 +127,7 @@ inline void im2col_gpu(mshadow::Stream* s, int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col; + using namespace mxnet_op; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<::GetStream(s)>>>( @@ -303,6 +290,7 @@ inline void im2col(mshadow::Stream* s, index_t num_spatial_axes = kernel_shape.ndim(); CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) @@ -347,6 +335,7 @@ inline void col2im_gpu(mshadow::Stream* s, const DType* data_col, const int int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height * width; + using namespace mxnet_op; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. // NOLINT_NEXT_LINE(whitespace/operators) @@ -487,6 +476,7 @@ inline void col2im(mshadow::Stream* s, index_t im_size = im_shape.ProdShape(1, im_shape.ndim()); // num_axes should be smaller than block size CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 5160dec8c879..c4806d5cde58 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -9,19 +9,6 @@ namespace mxnet { namespace op { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -/*! 
- * \brief Get the number of blocks for cuda kernel given N - */ -inline int cuda_get_num_blocks(const int N) { - using namespace mshadow::cuda; - return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); -} - template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -58,6 +45,50 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, OpReqType req_type, + DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * channels + c) * depth * height * width; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = 
(d * height + h) * width + w; + const DType in_val = in_slice[in_index]; + if (in_val > max_val) { + max_val = in_val; + } + } + } + } + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -142,6 +173,66 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCDHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * depth * height 
* width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int idx = (d * height + h) * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -187,6 +278,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + using namespace mxnet_op; if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -215,6 +307,40 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + ishape[4], oshape[2], oshape[3], oshape[4], + kernel[0], kernel[1], kernel[2], stride[0], + stride[1], stride[2], pad[0], pad[1], pad[2], req_type, + out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], 
oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } } @@ -227,6 +353,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } + using namespace mxnet_op; if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -258,7 +385,44 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + 
// NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } + } } // namespace op From 9302a0ec8efab2ec070a4ef1c4ce801b1d1a1115 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 17:36:49 -0700 Subject: [PATCH 07/16] Added 3d avg/sum pooling gpu --- src/operator/nn/pool.cuh | 134 ++++++++++++++++++++++++++++++++------- src/operator/nn/pool.h | 106 +++++++++++++++++++++++++++++-- 2 files changed, 212 insertions(+), 28 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index c4806d5cde58..52cf579a36ea 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -123,6 +123,48 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, + const int stride_d, const int stride_h, const int stride_w, + const int pad_d, const int pad_h, const int pad_w, + OpReqType req_type, DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * 
stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, depth); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * depth * height * width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[(d * height + h) * width + w]; + } + } + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -273,6 +315,54 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, const int stride_h, + const int stride_w, const int pad_d, const int pad_h, + const int pad_w, DType* in_grad, bool isAvg = false) { + // index is the input image index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int d = (index / width / height) % depth + pad_d; + const int c = (index / width / height / depth) % channels; + const int n = index / width / height / depth / channels; + const int pdstart = (d < kernel_d) ? 
0 : (d - kernel_d) / stride_d + 1; + const int pdend = min(d / stride_d + 1, pooled_depth); + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width; + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size; + } + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -317,27 +407,25 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], pad[2], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], 
kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -395,29 +483,27 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], pad[2], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad, true); + 
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 538041621198..a204e235b419 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -143,6 +143,54 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * 
stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[(d*height+h)*width+w]; + } + } + } + KERNEL_ASSIGN(out_data[(pd*pooled_height+ph)*pooled_width+pw], + req_type, getAvg? sum/pool_size : sum); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -297,6 +345,56 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } +template +inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_grad_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_grad_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; 
++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[(d*height+h)*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -316,9 +414,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (pool_enum::kMaxPooling == pool_type) { pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -349,9 +447,9 @@ inline void unpool(mshadow::Stream* s, 
const DType* out_grad, const DType* if (pool_enum::kMaxPooling == pool_type) { unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); } else if (pool_enum::kSumPooling == pool_type) { - //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } From aaac667c38567555a5b06f9c2d8941a6e23a6b50 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 22:25:53 -0700 Subject: [PATCH 08/16] Added max pooling 1d cpu and gpu --- src/operator/nn/pool.cuh | 152 +++++++++++++++++++++++++++++++--- src/operator/nn/pool.h | 101 +++++++++++++++++++++- src/operator/pooling-inl.h | 31 ++++++- src/operator/pooling_v1-inl.h | 2 +- 4 files changed, 268 insertions(+), 18 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 52cf579a36ea..de541616d8ce 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -9,6 +9,34 @@ namespace mxnet { namespace op { +template +__global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * 
channels + c) * width; + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + const DType in_val = in_slice[w]; + if (in_val > max_val) { + max_val = in_val; + } + } + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -34,8 +62,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, DType max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index = h * width + w; - const DType in_val = in_slice[in_index]; + const DType in_val = in_slice[h * width + w]; if (in_val > max_val) { max_val = in_val; } @@ -77,8 +104,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index = (d * height + h) * width + w; - const DType in_val = in_slice[in_index]; + const DType in_val = in_slice[(d * height + h) * width + w]; if (in_val > max_val) { max_val = in_val; } @@ -165,6 +191,45 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int 
wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + for (int w = wstart; w < wend; ++w) { + if (in_data_slice[w] == max_val) { + max_idx = w; + break; + } + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -198,8 +263,9 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr bool found = false; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - max_idx = h * width + w; - if (in_data_slice[max_idx] == max_val) { + const int idx = h * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; found = true; break; } @@ -255,7 +321,7 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - int idx = (d * height + h) * width + w; + const int idx = (d * height + h) * width + w; if (in_data_slice[idx] == max_val) { max_idx = idx; found = true; @@ -369,7 +435,38 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { using namespace mxnet_op; - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], + oshape[2], kernel[0], stride[0], pad[0], 
req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_max_2d_gpu_kernel<<* s, const DType* out_grad, const DType* mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } using namespace mxnet_op; - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], + in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // 
NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) unpool_max_2d_gpu_kernel<<* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } - } } // namespace op diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index a204e235b419..bc373b49d996 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -15,6 +15,38 @@ enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +template +inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + if (in_data[w] > max_val) { + max_val = in_data[w]; + } + } + KERNEL_ASSIGN(out_data[pw], req_type, max_val); + } + 
in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -191,6 +223,46 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_offset = ishape[2]; + const index_t out_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + int max_idx = -1; + for (int w = wstart; w < wend; ++w) { + if (in_data[w] == out_data[pw]) { + max_idx = w; + break; + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. 
+ if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pw]; + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -400,7 +472,17 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -420,8 +502,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; } - } template @@ -433,7 +516,17 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + 
//unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { @@ -453,6 +546,8 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } } diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index b1dcad8d3d7e..8156c3796539 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -137,7 +137,10 @@ class PoolingProp : public OperatorProperty { void Init(const std::vector >& kwargs) override { using namespace mshadow; param_.Init(kwargs); - if (param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); } else { @@ -160,11 +163,31 @@ class PoolingProp : public OperatorProperty { std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x) " - << "Or 5D in (batch, channel, d, y, x)"; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; TShape oshape = dshape; if (dshape.ndim() == 0) return false; - if 
(param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; if (param_.global_pool) { oshape[2] = 1; diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index ff71d2e7b8a5..c6bc01e52f5a 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -245,7 +245,7 @@ class PoolingV1Prop : public OperatorProperty { oshape[3] = 1; oshape[4] = 1; } else { - if (param_.pool_type == pool_v1_enum::kValid) { + if (param_.pooling_convention == pool_v1_enum::kValid) { oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0]; oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / From 076b60f6982c7d37994f422ac27e503fa41d0152 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 10:01:48 -0700 Subject: [PATCH 09/16] Added 1d avg/sum pooling cpu and gpu --- src/operator/nn/pool.cuh | 95 +++++++++++++++++++++++++++++----------- src/operator/nn/pool.h | 72 ++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 30 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index de541616d8ce..026b9e88d104 100644 --- 
a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -115,6 +115,30 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int width, const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, OpReqType req_type, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (wend - wstart) : 1); + wstart = max(wstart, 0); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * width; + for (int w = wstart; w < wend; ++w) { + sum += out_slice[w]; + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -341,6 +365,37 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int c = (index / width) % channels; + const int n = index / width / channels; + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_width; + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (wend - wstart) : 1); + gradient += out_grad_slice[pw] / pool_size; + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -442,27 +497,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( - oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( - oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], 
req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -547,29 +596,23 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index bc373b49d996..debb7eae404d 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -136,6 +136,37 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS 
} } +template +inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (wend - wstart); + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + DType sum = 0; + for (int w = wstart; w < wend; ++w) { + sum += in_data[w]; + } + KERNEL_ASSIGN(out_data[pw], req_type, getAvg? sum/pool_size : sum); + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -375,6 +406,39 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, } } +template +inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_grad_offset = ishape[2]; + const index_t out_grad_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, 
width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = wend - wstart; + } + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + for (int w = wstart; w < wend; ++w) { + in_grad[w] += out_grad[pw] / pool_size; + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + template inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -476,9 +540,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (pool_enum::kMaxPooling == pool_type) { pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -520,9 +584,9 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (pool_enum::kMaxPooling == pool_type) { unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); } else if (pool_enum::kSumPooling == pool_type) { - //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } From 1e162a1cc4ff843ac9bf1166781e5c6d0eff70f2 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 
20 Mar 2017 10:11:24 -0700 Subject: [PATCH 10/16] Added pooling test --- tests/python/gpu/test_operator_gpu.py | 162 +++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 1 deletion(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a82530928bbc..6aa3912216a3 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -182,6 +182,9 @@ def test_grid_generator_with_type(): check_consistency(sym, ctx_list, grad_req="add") +# Checking max pooling consistency over the data sets of different float types is problematic +# as one max value in a float32 data set may not be the max value in a float16 data set. +# This function will not be called. def test_pooling_with_type(): np.random.seed(1234) ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, @@ -204,6 +207,163 @@ def test_pooling_with_type(): check_consistency(sym, ctx_list) +def test_pooling_versions(): + def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, + pooling_convention='valid', global_pool=False): + ctx_list = [] + sym_list = [] + # PoolingV1 cpu + if 'pool_v1_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # PoolingV1 gpu + if 'pool_v1_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, 
pool_type=pool_type, global_pool=True, name='pool')) + # Pooling cpu + if 'pool_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # Pooling gpu + if 'pool_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=True, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True, + name='pool')) + # CuDNNPooling + if 'pool_cudnn' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=False, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=False, + name='pool')) + check_consistency(sym_list, ctx_list) + + def test_1d_pooling(pool_type): + data = (2, 3, 20) + kernel = (4,) + pad = (0,) + stride = (1,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0,) + stride = (1,) + 
test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + def test_2d_pooling(pool_type): + data = (2, 3, 20, 20) + kernel = (4, 5) + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + 
global_pool=True) + + def test_3d_pooling(pool_type): + data = (2, 3, 20, 20, 20) + kernel = (4, 5, 3) + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + test_1d_pooling('max') + test_1d_pooling('avg') + test_1d_pooling('sum') + + test_2d_pooling('max') + test_2d_pooling('avg') + test_2d_pooling('sum') + + test_3d_pooling('max') + test_3d_pooling('avg') + test_3d_pooling('sum') + + def test_upsampling_with_type(): sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type='nearest', num_args=1) ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}}, @@ -490,7 +650,7 @@ def test_unfuse(): test_convolution_options() test_convolution_versions() test_convolution_with_type() - test_pooling_with_type() + test_pooling_versions() test_batchnorm_with_type() test_batchnorm_with_type() test_deconvolution_with_type() From 
f7b8600ef662234b0be55a741dc884af141a7f3f Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 12:09:54 -0700 Subject: [PATCH 11/16] Added description and Caffe copyright notice --- src/operator/nn/pool.cuh | 60 +++++++++++++++++++++++++++++++++++++++- src/operator/nn/pool.h | 58 ++++++++++++++++++++++++++++++++++++++ src/operator/pooling.cc | 6 +++- 3 files changed, 122 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 026b9e88d104..7859ea776626 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -1,3 +1,61 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.h + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. 
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu + * \author Jun Wu + */ + #ifndef MXNET_OPERATOR_NN_POOL_CUH_ #define MXNET_OPERATOR_NN_POOL_CUH_ @@ -16,7 +74,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int stride_w, const int pad_w, OpReqType req_type, DType* out_data) { using mshadow::red::limits::MinValue; - // index is the output image's pixel index in NCHW + // index is the output image's pixel index in NCW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int c = (index / pooled_width) % channels; diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index debb7eae404d..bc83b5c32307 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -1,3 +1,61 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.h + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. 
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp + * \author Jun Wu + */ + #ifndef MXNET_OPERATOR_NN_POOL_H_ #define MXNET_OPERATOR_NN_POOL_H_ diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index e4722acaf3e8..b706c5d9a252 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -79,6 +79,10 @@ DMLC_REGISTER_PARAMETER(PoolingParam); MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) .describe(R"code(Perform pooling on the input. +The shapes for 1-D pooling are +- **data**: *(batch_size, channel, width)*, +- **out**: *(batch_size, num_filter, out_width)*. + The shapes for 2-D pooling is - **data**: *(batch_size, channel, height, width)* @@ -106,7 +110,7 @@ Three pooling options are supported by ``pool_type``: - **max**: max pooling - **sum**: sum pooling -1-D pooling is special case of 2-D pooling with *weight=1* and +1-D pooling is special case of 2-D pooling with *width=1* and *kernel[1]=1*. For 3-D pooling, an additional *depth* dimension is added before From cefe72d3105e6e76d4220e2a878bebfb6e29ecc0 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 13:44:19 -0700 Subject: [PATCH 12/16] Added comments for pool and unpool functions --- src/operator/nn/pool.cuh | 102 ++++++++++++++++++++++++++++++++++----- src/operator/nn/pool.h | 100 +++++++++++++++++++++++++++++++++----- 2 files changed, 177 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 7859ea776626..22d0973a2dd8 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -2,33 +2,33 @@ ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT - * + * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. - * + * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. 
- * + * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. - * + * * LICENSE - * + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * + * modification, are permitted provided that the following conditions are met: + * * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * + * and/or other materials provided with the distribution. + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -39,9 +39,9 @@ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * * CONTRIBUTION AGREEMENT - * + * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. 
@@ -49,7 +49,7 @@ ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2017 by Contributors - * \file pool.h + * \file pool.cuh * \brief Function definitions of pooling 1/2/3-D images. * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu @@ -67,6 +67,10 @@ namespace mxnet { namespace op { +/*! + * \brief max pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, @@ -95,6 +99,10 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -130,6 +138,10 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, @@ -173,6 +185,10 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief avg/sum pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, @@ -197,6 +213,10 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! 
+ * \brief avg/sum pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -231,6 +251,10 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief avg/sum pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, @@ -273,6 +297,10 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -312,6 +340,10 @@ __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief max unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -363,6 +395,10 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief max unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -423,6 +459,10 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. 
Use the interface unpool(). + */ template __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int width, @@ -454,6 +494,10 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -494,6 +538,10 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int depth, const int height, @@ -542,6 +590,19 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is gpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -636,6 +697,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } } +/*! 
+ * \brief This function serves as an interface for 1/2/3-D unpooling operations. + * \param s context stream defining the device in use is gpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ template inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index bc83b5c32307..c32d12a48236 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -2,33 +2,33 @@ ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT - * + * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. - * + * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. - * + * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. 
- * + * * LICENSE - * + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * + * modification, are permitted provided that the following conditions are met: + * * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * + * and/or other materials provided with the distribution. + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -39,9 +39,9 @@ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * * CONTRIBUTION AGREEMENT - * + * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. @@ -73,6 +73,10 @@ enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +/*! + * \brief max pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -105,6 +109,10 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! 
+ * \brief max pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -146,6 +154,10 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief max pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -194,6 +206,10 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -225,6 +241,10 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -264,6 +284,10 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). 
+ */ template inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -312,6 +336,10 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief max unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -352,6 +380,10 @@ inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief max unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -404,6 +436,10 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief max unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -464,6 +500,10 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief avg/sum unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -497,6 +537,10 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief avg/sum unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). 
+ */ template inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -539,6 +583,10 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief avg/sum unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -589,6 +637,19 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is cpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -629,6 +690,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } } +/*! + * \brief This function serves as an interface for 1/2/3-D unpooling operations. 
+ * \param s context stream defining the device in use is cpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ template inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, From 7e041a1af9cebcea327e498492a2082b422288a4 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 12:55:31 -0700 Subject: [PATCH 13/16] Fixed lint --- src/operator/nn/pool.cuh | 46 +++++++++++++++---------------- src/operator/nn/pool.h | 51 ++++++++++++++++++----------------- src/operator/pooling_v1-inl.h | 6 +++-- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 22d0973a2dd8..54fd3461d80f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -76,7 +76,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, const int stride_w, const int pad_w, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCW CUDA_KERNEL_LOOP(index, nthreads) { @@ -95,7 +95,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, max_val = in_val; } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + 
out_data[index] = max_val; } } @@ -109,7 +109,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { @@ -134,7 +134,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + out_data[index] = max_val; } } @@ -149,7 +149,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_width, const int kernel_d, const int kernel_h, const int kernel_w, const int stride_d, const int stride_h, const int stride_w, const int pad_d, - const int pad_h, const int pad_w, OpReqType req_type, + const int pad_h, const int pad_w, DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCDHW @@ -181,7 +181,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + out_data[index] = max_val; } } @@ -192,7 +192,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, template __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, - const int stride_w, const int pad_w, OpReqType req_type, + const int stride_w, const int pad_w, DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -209,7 +209,7 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, for (int w = wstart; w < wend; ++w) { sum += out_slice[w]; } - 
KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -223,7 +223,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, - const int pad_h, const int pad_w, OpReqType req_type, + const int pad_h, const int pad_w, DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -247,7 +247,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, sum += out_slice[h * width + w]; } } - KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -263,7 +263,7 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int kernel_h, const int kernel_w, const int stride_d, const int stride_h, const int stride_w, const int pad_d, const int pad_h, const int pad_w, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; @@ -293,7 +293,7 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -600,7 +600,7 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr * \param pad pad shape * \param stride stride shape * \param pool_type supported pooling type: max, avg, sum - * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param req_type operator request type, only support kWriteTo for now * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW */ template @@ -608,6 +608,7 @@ inline void 
pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { @@ -615,21 +616,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is pool_max_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], - oshape[2], kernel[0], stride[0], pad[0], req_type, out_data); + oshape[2], kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], - kernel[0], stride[0], pad[0], req_type, out_data, true); + kernel[0], stride[0], pad[0], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], - kernel[0], stride[0], pad[0], req_type, out_data); + kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; @@ -641,7 +642,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); + stride[0], stride[1], pad[0], pad[1], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // 
NOLINT_NEXT_LINE(whitespace/operators) @@ -649,7 +650,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + stride[0], stride[1], pad[0], pad[1], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -657,7 +658,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); + stride[0], stride[1], pad[0], pad[1], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; @@ -670,8 +671,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], - stride[1], stride[2], pad[0], pad[1], pad[2], req_type, - out_data); + stride[1], stride[2], pad[0], pad[1], pad[2], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -680,7 +680,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], - pad[0], pad[1], pad[2], req_type, out_data, true); + pad[0], pad[1], pad[2], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // 
NOLINT_NEXT_LINE(whitespace/operators) @@ -689,7 +689,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], - pad[0], pad[1], pad[2], req_type, out_data); + pad[0], pad[1], pad[2], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index c32d12a48236..79accb5d521f 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -61,6 +61,7 @@ #include #include +#include #include "../mxnet_op.h" namespace mxnet { @@ -80,7 +81,7 @@ enum PoolingOpPadConventionType {kValid, kFull}; template inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int width = ishape[2]; const int pooled_width = oshape[2]; @@ -101,7 +102,7 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS max_val = in_data[w]; } } - KERNEL_ASSIGN(out_data[pw], req_type, max_val); + out_data[pw] = max_val; } in_data += in_data_offset; out_data += out_data_offset; @@ -116,7 +117,7 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; @@ -139,13 +140,13 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS DType 
max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index= h * width + w; + const int in_index = h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } } } - KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + out_data[pool_index] = max_val; } } in_data += in_data_offset; @@ -161,7 +162,7 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; @@ -189,14 +190,14 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index= (d * height + h) * width + w; + const int in_index = (d * height + h) * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } } } } - KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + out_data[pool_index] = max_val; } } } @@ -213,7 +214,7 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -233,7 +234,7 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS for (int w = wstart; w < wend; ++w) { sum += 
in_data[w]; } - KERNEL_ASSIGN(out_data[pw], req_type, getAvg? sum/pool_size : sum); + out_data[pw] = (getAvg? sum/pool_size : sum); } in_data += in_data_offset; out_data += out_data_offset; @@ -248,7 +249,7 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -275,7 +276,7 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS sum += in_data[h*width+w]; } } - KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? sum/pool_size : sum); + out_data[ph*pooled_width+pw] = (getAvg? sum/pool_size : sum); } } in_data += in_data_offset; @@ -291,7 +292,7 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -325,8 +326,7 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } } - KERNEL_ASSIGN(out_data[(pd*pooled_height+ph)*pooled_width+pw], - req_type, getAvg? sum/pool_size : sum); + out_data[(pd*pooled_height+ph)*pooled_width+pw] = (getAvg? 
sum/pool_size : sum); } } } @@ -647,7 +647,7 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, * \param pad pad shape * \param stride stride shape * \param pool_type supported pooling type: max, avg, sum - * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param req_type operator request type, only support kWriteTo for now * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW */ template @@ -655,33 +655,34 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, 
out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } } else if (kernel.ndim() == 3) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index c6bc01e52f5a..7fef1258093f 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -86,7 +86,8 @@ class PoolingV1Op : public Operator { Tensor data = in_data[pool_v1_enum::kData].get(s); Tensor out = out_data[pool_v1_enum::kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); - if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + if (param_.pool_type == pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { Assign(out, req[pool_v1_enum::kOut], pool(pad(data, param_.pad[0], param_.pad[1]), @@ -136,7 +137,8 @@ class PoolingV1Op : public Operator { mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + if (param_.pool_type == 
pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { Assign(input_grad, req[pool_v1_enum::kData], crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), From 32a888be781ebfa175481a76ff0bf3d55bc6d956 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 17:49:12 -0700 Subject: [PATCH 14/16] Fix MKL test --- src/operator/pooling.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index b706c5d9a252..c0fabe11e41c 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -21,8 +21,9 @@ template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; #if MXNET_USE_MKL2017 == 1 - if ((param.pool_type == pool_enum::kMaxPooling) - || (param.pool_type == pool_enum::kAvgPooling)) { + if (param.kernel.ndim() != 1 + && ((param.pool_type == pool_enum::kMaxPooling) + || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) { case mshadow::kFloat32: return new MKLPoolingOp(param); From e311deebb8df5aeb7fc184cbc343243994d5c147 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 20:21:13 -0700 Subject: [PATCH 15/16] Fix mkl --- src/operator/pooling.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index c0fabe11e41c..870dbf13d08a 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -21,7 +21,7 @@ template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; #if MXNET_USE_MKL2017 == 1 - if (param.kernel.ndim() != 1 + if (param.kernel.ndim() == 2 && ((param.pool_type == pool_enum::kMaxPooling) || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) { From c65cbdf3b1e6337452a81bf959e4797eb59b316e Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 21:07:55 -0700 Subject: [PATCH 16/16] Disable MKL for pad and stride > 0 --- src/operator/pooling.cc | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index 870dbf13d08a..85f15668f19a 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -20,8 +20,13 @@ namespace op { template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; + // TODO(junwu): Since MKL has a bug when pad and stride > 0, + // we disable MKL in those cases and will re-enable it after + // it is fixed by deleting lines 28 and 29. #if MXNET_USE_MKL2017 == 1 if (param.kernel.ndim() == 2 + && 0 == param.pad[0] && 0 == param.pad[1] + && 0 == param.stride[0] && 0 == param.stride[1] && ((param.pool_type == pool_enum::kMaxPooling) || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) {