From 4ca713c6c05e200409cfad9b6dd0f6be83dc37c3 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Tue, 20 Nov 2018 20:22:36 -0800
Subject: [PATCH 1/2] # This is a combination of 2 commits.

[TOPI] Fix atleast1d for reduce and squeeze
---
 nnvm/include/nnvm/compiler/util.h        |  11 +++
 nnvm/src/top/tensor/reduce.cc            |  19 ++--
 nnvm/src/top/tensor/transform.cc         |   7 +-
 topi/include/topi/detail/fuse.h          |  14 +--
 topi/include/topi/nn/l2_normalize.h      |   2 +-
 topi/include/topi/nn/softmax.h           |   2 +-
 topi/include/topi/reduction.h            | 109 ++++++++++++++---------
 topi/include/topi/tags.h                 |   8 ++
 topi/include/topi/transform.h            |  22 +++--
 topi/python/topi/cuda/reduction.py       |   6 +-
 topi/src/topi.cc                         |   4 +-
 topi/tests/python/test_topi_reduce.py    |   4 +
 topi/tests/python/test_topi_transform.py |   5 +-
 13 files changed, 131 insertions(+), 82 deletions(-)

diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h
index 5d5bc4478530..0f7fb2a5c875 100644
--- a/nnvm/include/nnvm/compiler/util.h
+++ b/nnvm/include/nnvm/compiler/util.h
@@ -28,6 +28,17 @@ inline tvm::Array ShapeToArray(TShape shape) {
   return result;
 }
 
+/*
+ * \brief Helper function to convert TShape to TVM array. Useful for
+ * passing data from NNVM param structures to TOPI ops.
+ *
+ * \param shape The shape to convert
+ *
+ * \return An Array of Expr, where each element is a constant int32
+ */
+inline tvm::Array ShapeToIntArray(TShape shape) {
+  return tvm::Array(ShapeToArray(shape).node_);
+}
 } // namespace compiler
 } // namespace nnvm
 #endif // NNVM_COMPILER_UTIL_H_
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index 7b768ac64304..a9480856f533 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -4,7 +4,7 @@
  * \brief reduce operator.
  */
 // Enforce TOPI to use old behavior that reduces to at least 1d
-#define TOPI_REDUCE_ATLEAST1D 1
+#define TOPI_OUTPUT_ATLEAST1D 1
 
 #include
 #include
@@ -20,13 +20,14 @@
 #include "topi/reduction.h"
 #include "topi/transform.h"
 
-static_assert(TOPI_REDUCE_ATLEAST1D, "need to use legacy reduce behavior");
+static_assert(TOPI_OUTPUT_ATLEAST1D, "need to use legacy reduce behavior");
 
 namespace nnvm {
 namespace top {
 using namespace tvm;
 using namespace nnvm::compiler;
+
 // reduce
 DMLC_REGISTER_PARAMETER(ReduceParam);
 
@@ -168,7 +169,7 @@ Example::
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) };
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array{
       topi::sum(inputs[0], axis, param.keepdims) };
   })
@@ -202,7 +203,7 @@ NNVM_REGISTER_REDUCE_OP(max)
     const ReduceParam& param = nnvm::get(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array{
       topi::max(inputs[0], axis, param.keepdims) };
   })
@@ -235,7 +236,7 @@ NNVM_REGISTER_REDUCE_OP(min)
     const ReduceParam& param = nnvm::get(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array{
       topi::min(inputs[0], axis, param.keepdims) };
   })
@@ -299,7 +300,7 @@ values over a given axis.
const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); Tensor out = topi::argmax(inputs[0], axis, param.keepdims); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; @@ -322,7 +323,7 @@ values over a given axis. const ReduceParam& param = nnvm::get(attrs.parsed); TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); Tensor out = topi::argmin(inputs[0], axis, param.keepdims); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; @@ -352,7 +353,7 @@ Example:: TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); Expr count = make_const(inputs[0]->dtype, 1); for (auto& i : r_axes) { @@ -387,7 +388,7 @@ Example:: TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; - auto axis = ShapeToArray(r_axes); + auto axis = ShapeToIntArray(r_axes); return Array{ topi::prod(inputs[0], axis, param.keepdims) }; }); diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 2f42727d6083..1fab5cf99c5a 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -3,6 +3,9 @@ * \file transform.cc * \brief Injective transformation of shape or type. */ + +#define TOPI_OUTPUT_ATLEAST1D 1 + #include #include #include @@ -19,6 +22,8 @@ #include "topi/detail/constant_utils.h" #include "../../compiler/compile_engine.h" +static_assert(TOPI_OUTPUT_ATLEAST1D, "need to use legacy reduce behavior"); + namespace nnvm { namespace top { using namespace tvm; @@ -756,7 +761,7 @@ Examples:: const Array& inputs, const Array& out_info) { const SqueezeParam& param = nnvm::get(attrs.parsed); - auto axis = ShapeToArray(param.axis); + auto axis = ShapeToIntArray(param.axis); return Array{ topi::squeeze(inputs[0], axis) }; }) .set_attr( diff --git a/topi/include/topi/detail/fuse.h b/topi/include/topi/detail/fuse.h index 9ee7fbd1cffd..85ca0f9efacb 100644 --- a/topi/include/topi/detail/fuse.h +++ b/topi/include/topi/detail/fuse.h @@ -14,22 +14,16 @@ using namespace tvm; /*! 
* \brief Fuse all of the given args - * + * * \param stage The stage in which to apply the fuse * \param args The iteration variables to be fused * * \return The fused iteration variable */ inline IterVar Fuse(Stage stage, const Array& args) { - CHECK_GE(args.size(), 1) << "Fuse requires at least 1 arg"; - - auto fused = args[0]; - for (size_t i = 1; i < args.size(); ++i) { - IterVar out; - stage.fuse(fused, args[i], &out); - fused = out; - } - return fused; + IterVar res; + stage.fuse(args, &res); + return res; } } // namespace detail diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index cda1f3b5c813..6d98a75ec157 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -27,7 +27,7 @@ using namespace tvm; */ inline Tensor l2_normalize(const Tensor& data, float eps, - const Array& axis, + const Array& axis, std::string name = "tensor", std::string tag = "l2_normalize") { CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input"; diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h index d17f93046e72..8ee747ccd07c 100644 --- a/topi/include/topi/nn/softmax.h +++ b/topi/include/topi/nn/softmax.h @@ -40,7 +40,7 @@ inline Tensor softmax(const Tensor &x, auto k1 = tvm::reduce_axis(Range(0, input_shape[axis]), "k1"); auto k2 = tvm::reduce_axis(Range(0, input_shape[axis]), "k2"); - auto reduced_shape = MakeReduceTargetShape({axis}, x, false); + auto reduced_shape = MakeReduceTargetShape({axis}, x, false, false); auto insert_reduce_index = [axis, ndim](const Array &indices, const IterVar &reduce_index) { diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h index 777c103ec950..ed48aad7c433 100644 --- a/topi/include/topi/reduction.h +++ b/topi/include/topi/reduction.h @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -20,13 +19,6 @@ #include "topi/detail/constant_utils.h" #include "tvm/tvm.h" -/*! - * \brief macro flag to enable some legacy behavior which requires - * reduction result to be at least 1d. - */ -#ifndef TOPI_REDUCE_ATLEAST1D -#define TOPI_REDUCE_ATLEAST1D 0 -#endif namespace topi { using namespace tvm; @@ -42,30 +34,34 @@ using FCommReduce = std::function< * \brief Convert a reduction axis which could be empty or have negative * elements into a real axis with valid dimension indices. * +* \param ndim Number of dimensions in the target. +* \param axis The axis parameter. +* * \return A non-empty sorted array of valid dimension indices, with no duplicates. * If the input axis is empty, the result will be an axis including all dimensions. * If any input element is negative, it will be treated as an offset from the * last dimension (same as python indexing rules). 
*/ -inline std::vector GetRealAxis(int ndim, const std::vector& axis) { +inline std::vector GetRealAxis(int ndim, const Array& axis) { std::vector real_axis; - if (axis.size() == 0) { + if (!axis.defined() || axis.size() == 0) { for (int i = 0; i < ndim; ++i) { real_axis.push_back(i); } } else { // Use a set so duplicates are removed and the dims are sorted - std::set dims; - for (auto ele : axis) { - if (ele < 0) { - ele += ndim; - } - if (ele >= ndim) { - LOG(ERROR) << ele << " exceeds the maximum dimension " << ndim; + for (auto elem : axis) { + int64_t val = elem->value; + if (val < 0) { + val += ndim; } - dims.emplace(ele); + CHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim; + CHECK_GE(val, 0); + real_axis.push_back(static_cast(val)); } - std::copy(dims.begin(), dims.end(), std::back_inserter(real_axis)); + std::sort(real_axis.begin(), real_axis.end()); + real_axis.resize( + std::unique(real_axis.begin(), real_axis.end()) - real_axis.begin()); } return real_axis; } @@ -84,7 +80,8 @@ inline Array MakeReduceAxes(const std::vector& real_axis, const Te /*! \brief Calculate the target shape for a reduce op */ inline Array MakeReduceTargetShape(const std::vector& real_axis, const Tensor& data, - bool keepdims) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); Array target_shape; if (keepdims) { @@ -104,7 +101,7 @@ inline Array MakeReduceTargetShape(const std::vector& real_axis, } } } - if (target_shape.size() == 0 && TOPI_REDUCE_ATLEAST1D) { + if (target_shape.size() == 0 && atleast1d) { target_shape.push_back(1); } return target_shape; @@ -163,18 +160,19 @@ inline Tensor DoCommReduce(const Tensor& data, * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. + * \param atleast1d Whether the output need to be atleast1d. * * \return The result tensor. */ inline Tensor CommReduce(const Tensor& data, - const Array& axis, + const Array& axis, FReduce func, - bool keepdims = false) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; - auto axis_val = detail::GetConstIntValues(axis, "axis"); - auto real_axis = GetRealAxis(static_cast(ndim), axis_val); - auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + auto real_axis = GetRealAxis(static_cast(ndim), axis); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); return DoCommReduce(data, func, target_shape, real_axis, keepdims ? std::vector() : real_axis); } @@ -188,19 +186,20 @@ inline Tensor CommReduce(const Tensor& data, * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return The result tensor. 
*/ inline Tensor CommReduceIdx(const Tensor& data, - const Array& axis, + const Array& axis, FCommReduce func, - bool keepdims = false) { + bool keepdims, + bool atleast1d) { auto ndim = data->shape.size(); CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; - auto axis_val = detail::GetConstIntValues(axis, "axis"); - auto real_axis = GetRealAxis(static_cast(ndim), axis_val); + auto real_axis = GetRealAxis(static_cast(ndim), axis); auto reduce_axes = MakeReduceAxes(real_axis, data); - auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims); + auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d); auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data] (const Array& indices) { @@ -311,11 +310,15 @@ inline Expr ProdOp(Expr source, Array axis) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the sum operation */ -inline Tensor sum(const Tensor& data, Array axis, bool keepdims = false) { - return CommReduce(data, axis, tvm::sum, keepdims); +inline Tensor sum(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + return CommReduce(data, axis, tvm::sum, keepdims, atleast1d); } inline Tensor collapse_sum(const Tensor& data, Array target_shape) { @@ -356,11 +359,15 @@ inline Tensor collapse_sum(const Tensor& data, Array target_shape) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the min operation */ -inline Tensor min(const Tensor& data, Array axis, bool keepdims = false) { - return CommReduce(data, axis, MinOp, keepdims); +inline Tensor min(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + return CommReduce(data, axis, MinOp, keepdims, atleast1d); } /*! @@ -373,11 +380,15 @@ inline Tensor min(const Tensor& data, Array axis, bool keepdims = false) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the max operation */ -inline Tensor max(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) - return CommReduce(data, axis, MaxOp, keepdims); +inline Tensor max(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + return CommReduce(data, axis, MaxOp, keepdims, atleast1d); } /*! @@ -390,10 +401,14 @@ inline Tensor max(const Tensor& data, Array axis, bool keepdims = false) { * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. 
* * \return A Tensor whose op member is the argmin operation */ -inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmin(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0])); // idx @@ -407,7 +422,7 @@ inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false return result; }; auto func = MakeCommReducer(fcombine, fidentity, "argmin"); - return CommReduceIdx(data, axis, func, keepdims); + return CommReduceIdx(data, axis, func, keepdims, atleast1d); } /*! @@ -420,10 +435,14 @@ inline Tensor argmin(const Tensor& data, Array axis, bool keepdims = false * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the argmax operation */ -inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false) { +inline Tensor argmax(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0])); // idx @@ -437,7 +456,7 @@ inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false return result; }; auto func = MakeCommReducer(fcombine, fidentity, "argmax"); - return CommReduceIdx(data, axis, func, keepdims); + return CommReduceIdx(data, axis, func, keepdims, atleast1d); } /*! @@ -449,11 +468,15 @@ inline Tensor argmax(const Tensor& data, Array axis, bool keepdims = false * \param keepdims If this is set to true, the axes which are reduced are * left in the result as dimensions with size one. This enables the result * to broadcast correctly against the input array. +* \param atleast1d Whether the output need to be atleast1d. * * \return A Tensor whose op member is the prod operation */ -inline Tensor prod(const Tensor& data, Array axis, bool keepdims = false) { // NOLINT(*) - return CommReduce(data, axis, ProdOp, keepdims); +inline Tensor prod(const Tensor& data, + const Array& axis, + bool keepdims = false, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + return CommReduce(data, axis, ProdOp, keepdims, atleast1d); } } // namespace topi diff --git a/topi/include/topi/tags.h b/topi/include/topi/tags.h index 8c92644d96d3..cd24d60d6dfd 100644 --- a/topi/include/topi/tags.h +++ b/topi/include/topi/tags.h @@ -37,6 +37,14 @@ inline bool is_injective(std::string tag) { tag.rfind(kInjective, 0) == 0; } +/*! + * \brief macro flag to enable some legacy behavior which requires + * reduction result to be at least 1d. + */ +#ifndef TOPI_OUTPUT_ATLEAST1D +#define TOPI_OUTPUT_ATLEAST1D 0 +#endif + } // namespace topi #endif // TOPI_TAGS_H_ diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index cb09f1cb419e..f1e942acb9d7 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -196,30 +196,34 @@ inline Tensor reshape(const Tensor& x, * \param x The input tensor * \param axis Indices of the dimensions to remove. If this is empty, * all entries with a constant size of 1 will be removed. + * \param atleast1d Whether the output need to be atleast1d. 
* \param name The name of the operation * \param tag The tag to mark the operation * * \return A Tensor whose op member is the squeeze operation */ inline Tensor squeeze(const Tensor& x, - Array axis, + Array axis, + bool atleast1d = TOPI_OUTPUT_ATLEAST1D, std::string name = "tensor", std::string tag = kInjective) { - auto axis_val = GetConstIntValues(axis, "axis"); auto ndim = x->shape.size(); - if (axis_val.size() == 0) { + std::vector axis_val; + if (!axis.defined() || axis.size() == 0) { for (size_t i = 0; i < ndim; ++i) { if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) { axis_val.push_back(static_cast(i)); } } } else { - for (size_t i = 0; i < axis_val.size(); ++i) { - if (axis_val[i] < 0) { - axis_val[i] += static_cast(x->shape.size()); + for (size_t i = 0; i < axis.size(); ++i) { + int64_t val = axis[i]->value; + if (val < 0) { + val += static_cast(x->shape.size()); } - CHECK_EQ(GetConstInt(x->shape[axis_val[i]]), 1) << - "Dimension " << axis[i] << " must have size 1"; + CHECK_EQ(GetConstInt(x->shape[val]), 1) << + "Dimension " << val << " must have size 1"; + axis_val.push_back(val); } } @@ -231,7 +235,7 @@ inline Tensor squeeze(const Tensor& x, out_shape.push_back(x->shape[i]); } } - if (out_shape.size() == 0) { + if (out_shape.size() == 0 && atleast1d) { out_shape.push_back(1); } diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 79fa02156b19..4c5d1a507660 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -63,10 +63,12 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): sch[temp_val_input].compute_at(sch[real_output], outer_in) else: if is_idx_reduce: + spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis)) + sch[real_output].bind(spatial_axis, tvm.thread_axis("blockIdx.x")) sch[temp_idx_input].compute_at(sch[real_output], - sch[real_output].op.axis[0]) + spatial_axis) sch[temp_val_input].compute_at(sch[real_output], - sch[real_output].op.axis[0]) + spatial_axis) sch[real_output].set_store_predicate(thread_x.equal(0)) return sch diff --git a/topi/src/topi.cc b/topi/src/topi.cc index b47ba1165eb9..13a5ccad654c 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -59,9 +59,9 @@ using namespace tvm; using namespace tvm::runtime; /*! 
\brief Canonicalize an argument that may be Array or int to Array */ -Array ArrayOrInt(TVMArgValue arg) { +Array ArrayOrInt(TVMArgValue arg) { if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) { - Array result; + Array result; result.push_back(arg.operator int()); return result; } else { diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 3b3472f538b7..77a33d86ed3e 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -97,6 +97,10 @@ def check_device(device): def test_reduce_map(): + verify_reduce_map_ele(in_shape=(32,), + axis=0, + keepdims=False, + type="argmax") verify_reduce_map_ele(in_shape=(128, 24, 128, 24), axis=(1, 2, 3), keepdims=True, diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index dc3c3fb70b24..84d4aa6dc952 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -91,10 +91,7 @@ def check_device(device): data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.squeeze(data_npy, axis=axis) data_nd = tvm.nd.array(data_npy, ctx) - if out_npy.shape == (): - out_nd_shape = (1,) - else: - out_nd_shape = out_npy.shape + out_nd_shape = out_npy.shape out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) From a9928178bbfe0778b5ec952f572c99f3ecf88af7 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 21 Nov 2018 11:48:46 -0800 Subject: [PATCH 2/2] remove macro switch --- nnvm/src/top/tensor/reduce.cc | 19 +++++++------------ nnvm/src/top/tensor/transform.cc | 7 +------ topi/include/topi/reduction.h | 12 ++++++------ topi/include/topi/tags.h | 8 -------- topi/include/topi/transform.h | 2 +- topi/tests/python_cpp/test_topi_transform.py | 5 +---- 6 files changed, 16 insertions(+), 37 deletions(-) diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc index a9480856f533..007a3cc6e3fb 100644 --- a/nnvm/src/top/tensor/reduce.cc +++ b/nnvm/src/top/tensor/reduce.cc @@ -3,9 +3,6 @@ * \file reduce.cc * \brief reduce operator. */ -// Enforce TOPI to use old behavior that reduces to at least 1d -#define TOPI_OUTPUT_ATLEAST1D 1 - #include #include #include @@ -20,8 +17,6 @@ #include "topi/reduction.h" #include "topi/transform.h" -static_assert(TOPI_OUTPUT_ATLEAST1D, "need to use legacy reduce behavior"); - namespace nnvm { namespace top { using namespace tvm; @@ -171,7 +166,7 @@ Example:: if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; auto axis = ShapeToIntArray(r_axes); return Array{ - topi::sum(inputs[0], axis, param.keepdims) }; + topi::sum(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -205,7 +200,7 @@ NNVM_REGISTER_REDUCE_OP(max) param.axis, param.exclude); auto axis = ShapeToIntArray(r_axes); return Array{ - topi::max(inputs[0], axis, param.keepdims) }; + topi::max(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -238,7 +233,7 @@ NNVM_REGISTER_REDUCE_OP(min) param.axis, param.exclude); auto axis = ShapeToIntArray(r_axes); return Array{ - topi::min(inputs[0], axis, param.keepdims) }; + topi::min(inputs[0], axis, param.keepdims, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, @@ -301,7 +296,7 @@ values over a given axis. 
TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); auto axis = ShapeToIntArray(r_axes); - Tensor out = topi::argmax(inputs[0], axis, param.keepdims); + Tensor out = topi::argmax(inputs[0], axis, param.keepdims, true); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; }); @@ -324,7 +319,7 @@ values over a given axis. TShape r_axes = GetReduceAxes(inputs[0]->shape.size(), param.axis, param.exclude); auto axis = ShapeToIntArray(r_axes); - Tensor out = topi::argmin(inputs[0], axis, param.keepdims); + Tensor out = topi::argmin(inputs[0], axis, param.keepdims, true); if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype); return Array{out}; }); @@ -361,7 +356,7 @@ Example:: } return Array{ - topi::divide(topi::sum(inputs[0], axis, param.keepdims), count) }; + topi::divide(topi::sum(inputs[0], axis, param.keepdims, true), count) }; }); NNVM_REGISTER_REDUCE_OP(prod) @@ -390,7 +385,7 @@ Example:: if (!r_axes.ndim()) return Array { topi::identity(inputs[0]) }; auto axis = ShapeToIntArray(r_axes); return Array{ - topi::prod(inputs[0], axis, param.keepdims) }; + topi::prod(inputs[0], axis, param.keepdims, true) }; }); diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 1fab5cf99c5a..492208ed7a7c 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -3,9 +3,6 @@ * \file transform.cc * \brief Injective transformation of shape or type. */ - -#define TOPI_OUTPUT_ATLEAST1D 1 - #include #include #include @@ -22,8 +19,6 @@ #include "topi/detail/constant_utils.h" #include "../../compiler/compile_engine.h" -static_assert(TOPI_OUTPUT_ATLEAST1D, "need to use legacy reduce behavior"); - namespace nnvm { namespace top { using namespace tvm; @@ -762,7 +757,7 @@ Examples:: const Array& out_info) { const SqueezeParam& param = nnvm::get(attrs.parsed); auto axis = ShapeToIntArray(param.axis); - return Array{ topi::squeeze(inputs[0], axis) }; + return Array{ topi::squeeze(inputs[0], axis, true) }; }) .set_attr( "FGradient", [](const NodePtr& n, diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h index ed48aad7c433..f26d14951fd4 100644 --- a/topi/include/topi/reduction.h +++ b/topi/include/topi/reduction.h @@ -317,7 +317,7 @@ inline Expr ProdOp(Expr source, Array axis) { inline Tensor sum(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { return CommReduce(data, axis, tvm::sum, keepdims, atleast1d); } @@ -366,7 +366,7 @@ inline Tensor collapse_sum(const Tensor& data, Array target_shape) { inline Tensor min(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { return CommReduce(data, axis, MinOp, keepdims, atleast1d); } @@ -387,7 +387,7 @@ inline Tensor min(const Tensor& data, inline Tensor max(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { return CommReduce(data, axis, MaxOp, keepdims, atleast1d); } @@ -408,7 +408,7 @@ inline Tensor max(const Tensor& data, inline Tensor argmin(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0])); // idx @@ -442,7 +442,7 @@ inline Tensor argmin(const Tensor& data, 
inline Tensor argmax(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { auto fcombine = [](Array lhs, Array rhs) { Array result; result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0])); // idx @@ -475,7 +475,7 @@ inline Tensor argmax(const Tensor& data, inline Tensor prod(const Tensor& data, const Array& axis, bool keepdims = false, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D) { + bool atleast1d = false) { return CommReduce(data, axis, ProdOp, keepdims, atleast1d); } diff --git a/topi/include/topi/tags.h b/topi/include/topi/tags.h index cd24d60d6dfd..8c92644d96d3 100644 --- a/topi/include/topi/tags.h +++ b/topi/include/topi/tags.h @@ -37,14 +37,6 @@ inline bool is_injective(std::string tag) { tag.rfind(kInjective, 0) == 0; } -/*! - * \brief macro flag to enable some legacy behavior which requires - * reduction result to be at least 1d. - */ -#ifndef TOPI_OUTPUT_ATLEAST1D -#define TOPI_OUTPUT_ATLEAST1D 0 -#endif - } // namespace topi #endif // TOPI_TAGS_H_ diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index f1e942acb9d7..9bc62b2c0249 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -204,7 +204,7 @@ inline Tensor reshape(const Tensor& x, */ inline Tensor squeeze(const Tensor& x, Array axis, - bool atleast1d = TOPI_OUTPUT_ATLEAST1D, + bool atleast1d = false, std::string name = "tensor", std::string tag = kInjective) { auto ndim = x->shape.size(); diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py index 492f1d94c341..b411375b333e 100644 --- a/topi/tests/python_cpp/test_topi_transform.py +++ b/topi/tests/python_cpp/test_topi_transform.py @@ -100,10 +100,7 @@ def check_device(device): data_npy = np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.squeeze(data_npy, axis=axis) data_nd = tvm.nd.array(data_npy, ctx) - if out_npy.shape == (): - out_nd_shape = (1,) - else: - out_nd_shape = out_npy.shape + out_nd_shape = out_npy.shape out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype) foo(data_nd, out_nd) tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
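
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the new atleast1d flag is intended to be called from C++ after both commits are applied. It assumes the TVM/TOPI API of this revision and that the reduction axis parameter is an Array of Integer, as the new ShapeToIntArray helper implies; the include paths, placeholder shapes, and variable names are made up for the example.

#include <tvm/operation.h>
#include <topi/reduction.h>
#include <topi/transform.h>

void atleast1d_example() {
  using namespace tvm;
  // A 10x20 placeholder stands in for any input tensor.
  Tensor data = placeholder({10, 20}, Float(32), "data");
  Array<Integer> all_axes;  // empty axis list => reduce over every dimension

  // New TOPI default (atleast1d = false): a full reduction yields a 0-d tensor, shape {}.
  Tensor scalar_sum = topi::sum(data, all_axes, /*keepdims=*/false);

  // Legacy behavior, which the NNVM wrappers now request explicitly: shape {1}.
  Tensor one_d_sum = topi::sum(data, all_axes, /*keepdims=*/false, /*atleast1d=*/true);

  // Negative axes wrap around, as in Python indexing: -1 reduces the last dimension.
  Tensor row_max = topi::max(data, {Integer(-1)}, /*keepdims=*/false);

  // squeeze follows the same convention for its output rank: removing every
  // unit dimension of a {1, 1} tensor gives shape {} by default, or {1} when
  // atleast1d is set.
  Tensor unit = placeholder({1, 1}, Float(32), "unit");
  Tensor squeezed_0d = topi::squeeze(unit, Array<Integer>());
  Tensor squeezed_1d = topi::squeeze(unit, Array<Integer>(), /*atleast1d=*/true);
}

The NNVM operator registrations keep the legacy 1-d outputs by passing true explicitly, while plain TOPI calls now default to scalar (0-d) results; this is also why the Python tests drop the special case that mapped an empty output shape to (1,).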