From badd8608d41a8b10efc4f373de03c9227f189e78 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Tue, 12 Nov 2019 00:49:05 -0800 Subject: [PATCH 01/22] [TOPI][Relay][OP] Dynamic NMS and strided_slice --- include/tvm/relay/attrs/transform.h | 15 +- include/tvm/relay/attrs/vision.h | 26 +- python/tvm/relay/frontend/common.py | 2 +- python/tvm/relay/frontend/tensorflow.py | 51 +++ .../tvm/relay/frontend/tensorflow_parser.py | 4 +- python/tvm/relay/op/_transform.py | 12 + python/tvm/relay/op/transform.py | 10 +- python/tvm/relay/op/vision/nms.py | 36 ++- python/tvm/relay/testing/tf.py | 2 +- src/relay/op/tensor/transform.cc | 306 +++++++++++------- src/relay/op/vision/nms.cc | 34 +- .../transforms/combine_parallel_conv2d.cc | 19 +- src/relay/transforms/pattern_util.h | 2 +- .../frontend/tensorflow/test_control_flow.py | 26 +- .../frontend/tensorflow/test_debugging.py | 12 +- .../frontend/tensorflow/test_forward.py | 27 +- tests/python/frontend/tflite/test_forward.py | 2 +- tests/python/relay/test_any.py | 32 ++ tests/python/relay/test_op_level4.py | 42 ++- tests/python/relay/test_op_level5.py | 28 +- topi/python/topi/sort.py | 4 +- topi/python/topi/vision/nms.py | 247 ++++++++++++-- topi/python/topi/vision/ssd/multibox.py | 2 +- topi/tests/python/test_topi_vision.py | 17 +- tutorials/frontend/from_tensorflow.py | 4 +- 25 files changed, 740 insertions(+), 222 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index ccf8e54fea96..4f0c90ec4f4a 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -210,14 +210,17 @@ struct SplitAttrs : public tvm::AttrsNode { /*! \brief Attributes for StridedSlice operator */ struct StridedSliceAttrs : public tvm::AttrsNode { - Array begin; - Array end; - Array strides; + Expr begin; + Expr end; + Expr strides; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { - TVM_ATTR_FIELD(begin).describe("Indices for begin of slice, begin index is also inclusive"); - TVM_ATTR_FIELD(end).describe("Indices for end of slice, end index is exclusive"); - TVM_ATTR_FIELD(strides).set_default(Array({})).describe("Stride values of the slice"); + TVM_ATTR_FIELD(begin) + .describe("Indices for begin of slice, begin index is also inclusive"); + TVM_ATTR_FIELD(end) + .describe("Indices for end of slice, end index is exclusive"); + TVM_ATTR_FIELD(strides) + .describe("Stride values of the slice"); } }; diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index e7e24b19228b..0a0ca9fe93d6 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -89,6 +89,7 @@ struct GetValidCountsAttrs : public tvm::AttrsNode { /*! 
\brief Attributes used in non_maximum_suppression operator */ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode { int max_output_size; + double score_threshold; double iou_threshold; bool force_suppress; int top_k; @@ -101,20 +102,29 @@ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode 4 else None + + scores = AttrCvt(op_name="expand_dims", + extras={'axis': -1, 'num_newaxis': 1})([inputs[1]], attr) + + data = get_relay_op('concatenate')([scores, inputs[0]], -1) + # expand to [class_id, prob, box] + # data = _get_relay_op('concatenate')([scores, data], -1) + # expand to [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5] + + data = get_relay_op('expand_dims')(data, 0, 1) + + # Don't need to call get_valid_counts for TensorFlow and ONNX + # ct, data = _get_relay_op('get_valid_counts')(data, score_threshold=score_threshold, + # id_index=-1, score_index=0) + # get the number of anchors + data_shape = attr['_input_shapes'][inputs[1]] + valid_cnt = _expr.const(data_shape) + # TensorFlow NMS doesn't have parameter top_k + top_k = -1 + # score_index is 0 since TF doesn't have class id for nms input + score_index = 0 + nms_ret = get_relay_op('non_max_suppression')(data=data, + valid_count=valid_cnt, + max_output_size=max_output_size, + score_threshold=score_threshold, + iou_threshold=iou_threshold, + force_suppress=False, + top_k=top_k, + coord_start=1, + score_index=score_index, + id_index=-1, + return_indices=True, + invalid_to_bottom=False) + + end = get_relay_op("squeeze")(nms_ret[1], axis=[1]) + data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0]) + ret = get_relay_op("strided_slice")(data_slice, _expr.const([0]), end, _expr.const([1])) + return ret return _impl def _decode_image(): @@ -2027,6 +2076,8 @@ def _impl(inputs, attr, params, mod): 'Mod' : _elemwise('mod'), 'Mul' : _elemwise('multiply'), 'Neg' : AttrCvt('negative'), + 'NonMaxSuppressionV2' : _nms(), + 'NonMaxSuppressionV3' : _nms(), 'NoOp' : _no_op(), 'NotEqual' : _broadcast('not_equal'), 'OneHot' : _one_hot(), diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index fdbb8768597f..4e0f14c577cb 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -80,14 +80,14 @@ def _get_output_names(self): "required to restore from saved model.") tags = self._get_tag_set() output_names = set() - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: meta_graph_def = tf.saved_model.loader.load(sess, tags, self._model_dir) for sig_def in meta_graph_def.signature_def.values(): for output_tensor in sig_def.outputs.values(): output_names.add(output_tensor.name.replace(":0", "")) - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() return ",".join(output_names) def _load_saved_model(self): diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index e1c2bd7b0acf..7d81ab13f6db 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -101,6 +101,18 @@ def _arange_shape_func(start, stop, step): def arange_shape_func(attrs, inputs, _): return [_arange_shape_func(*inputs)] +@script +def _strided_slice_shape_func(data_shape, begin, end, strides): + ndim = len(data_shape.shape) + out = output_tensor((ndim,), "int64") + for i in const_range(ndim): + out[i] = int64(ceil_div((int64(end[i]) - int64(begin[i])), int64(strides[i]))) + return out + +@_reg.register_shape_func("strided_slice", True) +def strided_slice_shape_func(attrs, 
inputs, _): + return [_strided_slice_shape_func(*inputs)] + @script def _concatenate_shape_func(inputs, axis): ndim = inputs[0].shape[0] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 1da58ae3d90e..c8c540fa861b 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -619,13 +619,13 @@ def strided_slice(data, begin, end, strides=None): data : relay.Expr The source array to be sliced. - begin: list of int + begin: relay.Expr The indices to begin with in the slicing. - end: list of int + end: relay.Expr Indices indicating end of the slice. - strides: list of int, optional + strides: relay.Expr, optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. @@ -634,8 +634,8 @@ def strided_slice(data, begin, end, strides=None): ret : relay.Expr The computed result. """ - strides = strides or [] - return _make.strided_slice(data, list(begin), list(end), list(strides)) + strides = strides or const(1) + return _make.strided_slice(data, begin, end, strides) def strided_set(data, v, begin, end, strides=None): diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 70a9ec9ed5e4..a4e98a732715 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -56,6 +56,7 @@ def get_valid_counts(data, def non_max_suppression(data, valid_count, max_output_size=-1, + score_threshold=0.0, iou_threshold=0.5, force_suppress=False, top_k=-1, @@ -69,9 +70,11 @@ def non_max_suppression(data, Parameters ---------- data : relay.Expr - 3-D tensor with shape [batch_size, num_anchors, 6]. + 3-D tensor with shape [batch_size, num_anchors, 6] + or [batch_size, num_anchors, 5]. The last dimension should be in format of - [class_id, score, box_left, box_top, box_right, box_bottom]. + [class_id, score, box_left, box_top, box_right, box_bottom] + or [score, box_left, box_top, box_right, box_bottom]. valid_count : relay.Expr 1-D tensor for valid number of boxes. @@ -80,6 +83,9 @@ def non_max_suppression(data, Max number of output valid boxes for each instance. By default all valid boxes are returned. + score_threshold : float, optional + Lower limit of score for valid bounding boxes. + iou_threshold : float, optional Non-maximum suppression threshold. @@ -106,10 +112,24 @@ def non_max_suppression(data, Returns ------- - out : relay.Expr - 3-D tensor with shape [batch_size, num_anchors, 6]. + out : relay.Expr or relay.Tuple + return relay.Expr if return_indices is disabled, a 3-D tensor + with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 6]. + if return_indices is True, return relay.Tuple of two 2-D tensors, with + shape [batch_size, num_anchors] and [batch_size, num_valid_anchors] respectively. 
""" - return _make.non_max_suppression(data, valid_count, max_output_size, - iou_threshold, force_suppress, top_k, - coord_start, score_index, id_index, - return_indices, invalid_to_bottom) + out = _make.non_max_suppression(data, + valid_count, + max_output_size, + score_threshold, + iou_threshold, + force_suppress, + top_k, + coord_start, + score_index, + id_index, + return_indices, + invalid_to_bottom) + if return_indices: + return TupleWrapper(out, 2) + return out diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py index dc7937c0b346..567724d9d251 100644 --- a/python/tvm/relay/testing/tf.py +++ b/python/tvm/relay/testing/tf.py @@ -77,7 +77,7 @@ def AddShapesToGraphDef(session, out_node): Parameters ---------- - session : tf.Session + session : tf.compat.v1.Session Tensorflow session out_node : String or List Final output node of the graph. diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 7282ac74e6f3..5d471f99a47f 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1095,9 +1095,12 @@ bool ArangeRel(const Array& types, int num_inputs, const Attrs& raw_attrs, } } -inline te::Tensor DynamicArange(const te::Tensor& start, const te::Tensor& stop, - const te::Tensor& step, tvm::DataType dtype, - std::string name = "tensor", std::string tag = topi::kInjective) { +inline te::Tensor DynamicArange(const te::Tensor& start, + const te::Tensor& stop, + const te::Tensor& step, + tvm::DataType dtype, + std::string name = "T_arange_dynamic", + std::string tag = topi::kInjective) { tvm::PrimExpr num_elem = tvm::tir::Var("num_elem"); return te::compute( {num_elem}, @@ -1110,6 +1113,7 @@ inline te::Tensor DynamicArange(const te::Tensor& start, const te::Tensor& stop, Array ArangeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const ArangeAttrs* param = attrs.as(); + CHECK(param != nullptr); te::Tensor start = inputs[0]; te::Tensor stop = inputs[1]; te::Tensor step = inputs[2]; @@ -1671,93 +1675,101 @@ Array GetIntArray(Array arr) { // strided_slice TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); -bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, - const TypeReporter& reporter) { - CHECK_EQ(types.size(), 2); - const auto* data = types[0].as(); - if (data == nullptr) return false; +bool StridedSliceRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 5); const StridedSliceAttrs* param = attrs.as(); CHECK(param != nullptr); - + const auto* data = types[0].as(); + CHECK(data != nullptr); auto dshape = data->shape; auto num_axis = dshape.size(); - std::vector stride_vec; - for (Integer i : param->strides) { - CHECK(i.defined()); - stride_vec.push_back(i->value); - } - for (size_t i = stride_vec.size(); i < num_axis; ++i) { - stride_vec.push_back(1); - } - const int64_t max_range = std::numeric_limits::max(); - - std::vector begin_vec; - for (size_t i = 0; i < param->begin.size(); ++i) { - if (!param->begin[i].defined()) { - // value=None + // calculate output shape + std::vector oshape(num_axis); + const ConstantNode *cbegin, *cend, *cstrides; + if ((cbegin = param->begin.as()) && + (cend = param->end.as()) && + (cstrides = param->strides.as())) { + std::vector stride_vec; + int32_t* strides_val = reinterpret_cast(cstrides->data->data); + for (size_t i = 0; i < cstrides->data.Shape().front(); ++i){ + stride_vec.push_back(strides_val[i]); + } + for (size_t i = stride_vec.size(); i < num_axis; ++i) 
{ + stride_vec.push_back(1); + } + const int64_t max_range = std::numeric_limits::max(); + std::vector begin_vec; + int32_t* begin_val = reinterpret_cast(cbegin->data->data); + for (size_t i = 0; i < cbegin->data.Shape().front(); ++i){ + begin_vec.push_back(begin_val[i]); + } + for (size_t i = begin_vec.size(); i < num_axis; ++i) { begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range); - } else { - begin_vec.push_back(param->begin[i]->value); } - } - for (size_t i = begin_vec.size(); i < num_axis; ++i) { - begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range); - } - - std::vector end_vec; - for (size_t i = 0; i < param->end.size(); ++i) { - // allow end to be None - if (!param->end[i].defined()) { + std::vector end_vec; + int32_t* end_val = reinterpret_cast(cend->data->data); + for (size_t i = 0; i < cend->data.Shape().front(); ++i){ + end_vec.push_back(end_val[i]); + } + for (size_t i = end_vec.size(); i < num_axis; ++i) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } else { - end_vec.push_back(param->end[i]->value); } - } - for (size_t i = end_vec.size(); i < num_axis; ++i) { - end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } - - std::vector oshape(dshape.size()); - for (size_t i = 0; i < num_axis; ++i) { - int64_t stride_v = stride_vec[i]; - int64_t begin_v = begin_vec[i]; - int64_t end_v = end_vec[i]; - if ((stride_v == 1 && begin_v == 0 && end_v == max_range) || - (stride_v == -1 && begin_v == max_range && end_v == 0)) { - // Quick path, do not slice this dimension. - oshape[i] = dshape[i]; - continue; + for (size_t i = 0; i < num_axis; ++i) { + int64_t stride_v = stride_vec[i]; + int64_t begin_v = begin_vec[i]; + int64_t end_v = end_vec[i]; + + if ((stride_v == 1 && + begin_v == 0 && + end_v == max_range) || + (stride_v == -1 && + begin_v == max_range && + end_v == 0)) { + // Quick path, do not slice this dimension. + oshape[i] = dshape[i]; + continue; + } + // Normal path, require the shape to be concrete integer. + // Require concrete integer as symbolic inference of min/max + // can get complicated and not very helpful. + const int64_t* p_dim_size = tir::as_const_int(dshape[i]); + CHECK(p_dim_size) + << "strided_slice requires sliced dimension to be concrete int"; + int64_t dim_size = p_dim_size[0]; + begin_v = (begin_v < 0) ? dim_size + begin_v : begin_v; + end_v = (end_v < 0) ? dim_size + end_v : end_v; + + int64_t slice_range, step; + if (stride_v < 0) { + if (end_v < -1) end_v = -1; + CHECK_LT(end_v, begin_v) + << "strided_slice get empty slice at axis " << i; + begin_v = std::min(dim_size - 1, begin_v); + slice_range = begin_v - end_v; + step = -stride_v; + } else { + if (begin_v < 0) begin_v = 0; + CHECK_GE(stride_v, 0); + CHECK_LT(begin_v, end_v) + << "strided_slice get empty slice at axis " << i; + end_v = std::min(dim_size, end_v); + slice_range = end_v - begin_v; + step = stride_v; + } + oshape[i] = tir::make_const(dshape[i].dtype(), (slice_range + step - 1) / step); } - // Normal path, require the shape to be concrete integer. - // Require concrete integer as symbolic inference of min/max - // can get complicated and not very helpful. - const int64_t* p_dim_size = tir::as_const_int(dshape[i]); - CHECK(p_dim_size) << "strided_slice requires sliced dimension to be concrete int"; - int64_t dim_size = p_dim_size[0]; - begin_v = (begin_v < 0) ? dim_size + begin_v : begin_v; - end_v = (end_v < 0) ? 
dim_size + end_v : end_v; - - int64_t slice_range, step; - if (stride_v < 0) { - if (end_v < -1) end_v = -1; - CHECK_LT(end_v, begin_v) << "strided_slice get empty slice at axis " << i; - begin_v = std::min(dim_size - 1, begin_v); - slice_range = begin_v - end_v; - step = -stride_v; - } else { - if (begin_v < 0) begin_v = 0; - CHECK_GE(stride_v, 0); - CHECK_LT(begin_v, end_v) << "strided_slice get empty slice at axis " << i; - end_v = std::min(dim_size, end_v); - slice_range = end_v - begin_v; - step = stride_v; + } else { + for (size_t i = 0; i < num_axis; ++i) { + oshape[i] = Any::make(); } - oshape[i] = tir::make_const(dshape[i].dtype(), (slice_range + step - 1) / step); } - reporter->Assign(types[1], TensorType(oshape, data->dtype)); + reporter->Assign(types[4], TensorType(oshape, data->dtype)); return true; } @@ -1783,11 +1795,31 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, auto shape = old_in_shapes[0]; // NOTE: Discard "const" qualifier here. - auto* params = const_cast(attrs.as()); + auto *params = const_cast(attrs.as()); + CHECK(params != nullptr); + Array begin, end, strides; + const ConstantNode *cbegin, *cend, *cstrides; + if ((cbegin = params->begin.as()) && + (cend = params->end.as()) && + (cstrides = params->strides.as())) { + + int32_t* strides_val = reinterpret_cast(cstrides->data->data); + for (size_t i = 0; i < cstrides->data.Shape().front(); ++i){ + strides.push_back(strides_val[i]); + } + int32_t* begin_val = reinterpret_cast(cbegin->data->data); + for (size_t i = 0; i < cbegin->data.Shape().front(); ++i){ + begin.push_back(begin_val[i]); + } + int32_t* end_val = reinterpret_cast(cend->data->data); + for (size_t i = 0; i < cend->data.Shape().front(); ++i){ + end.push_back(end_val[i]); + } + } Array new_begin, new_end; - for (size_t i = 0; i < params->begin.size(); i++) { + for (size_t i = 0; i < begin.size(); i++) { const LayoutAxis& axis = layout[i]; if (!axis.IsPrimal()) { // original layout that contains splitted axes is not supported @@ -1795,53 +1827,104 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } auto factor = new_layout.FactorOf(axis); if (factor == -1) { - new_begin.push_back(params->begin[i]); - new_end.push_back(params->end[i]); + new_begin.push_back(begin[i]); + new_end.push_back(end[i]); } else { - if (params->strides.defined() && i < params->strides.size()) { - auto stride = params->strides[i]; + if (strides.defined() && i < strides.size()) { + auto stride = strides[i]; // arbitrary stride is not supported if (stride.defined() && stride->value != 1) { return {{Layout::Undef()}, {Layout::Undef()}}; } } - int64_t begin = params->begin[i].defined() ? params->begin[i]->value : 0; - int64_t end = - params->end[i].defined() ? params->end[i]->value : shape[i].as()->value; - if (begin % factor || end % factor) { + int64_t bg = begin[i].defined() ? begin[i]->value : 0; + int64_t ed = end[i].defined() ? end[i]->value : + shape[i].as()->value; + if (bg % factor || ed % factor) { // transform to original layout return {{Layout::Undef()}, {Layout::Undef()}}; } - new_begin.push_back(tvm::Integer(begin / factor)); - new_end.push_back(tvm::Integer(end / factor)); + new_begin.push_back(tvm::Integer(bg / factor)); + new_end.push_back(tvm::Integer(ed / factor)); } } layout = new_layout; - params->begin = new_begin; - params->end = new_end; } return {{layout}, {layout}}; } -// Positional relay function to create StridedSlice operator used by frontend FFI. 
-Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides) {
-  auto attrs = make_object<StridedSliceAttrs>();
-  attrs->begin = std::move(begin);
-  attrs->end = std::move(end);
-  attrs->strides = std::move(strides);
-  static const Op& op = Op::Get("strided_slice");
-  return Call(op, {data}, Attrs(attrs), {});
+inline te::Tensor DynamicStridedSlice(const te::Tensor& input,
+                                      const te::Tensor& begin,
+                                      const te::Tensor& end,
+                                      const te::Tensor& strides,
+                                      std::string name = "T_strided_slice_dynamic",
+                                      std::string tag = topi::kInjective) {
+  size_t src_tensor_dim = static_cast<size_t>(input->shape.size());
+  Array<PrimExpr> out_shape;
+  for (size_t i = 0; i < src_tensor_dim; ++i) {
+    out_shape.push_back(tvm::tir::Var("dim"));
+  }
+  return te::compute(out_shape, [&](const Array<tvm::tir::Var>& indices) {
+    Array<PrimExpr> real_indices;
+    for (int32_t i = 0; i < static_cast<int32_t>(src_tensor_dim); ++i) {
+      real_indices.push_back(indices[i] * strides(i) + begin(i));
+    }
+    return input(real_indices);
+  }, name, tag);
 }
 
 Array<te::Tensor> StridedSliceCompute(const Attrs& attrs, const Array<te::Tensor>& inputs,
                                       const Type& out_type) {
   const StridedSliceAttrs* param = attrs.as<StridedSliceAttrs>();
   CHECK(param != nullptr);
-  return Array<te::Tensor>{
-      topi::strided_slice(inputs[0], param->begin, param->end, param->strides)};
+  const ConstantNode *cbegin, *cend, *cstrides;
+  if ((cbegin = param->begin.as<ConstantNode>()) &&
+      (cend = param->end.as<ConstantNode>()) &&
+      (cstrides = param->strides.as<ConstantNode>())) {
+    Array<Integer> begin, end, strides;
+    int32_t* strides_val = reinterpret_cast<int32_t*>(cstrides->data->data);
+    for (size_t i = 0; i < cstrides->data.Shape().front(); ++i) {
+      strides.push_back(strides_val[i]);
+    }
+    int32_t* begin_val = reinterpret_cast<int32_t*>(cbegin->data->data);
+    for (size_t i = 0; i < cbegin->data.Shape().front(); ++i) {
+      begin.push_back(begin_val[i]);
+    }
+    int32_t* end_val = reinterpret_cast<int32_t*>(cend->data->data);
+    for (size_t i = 0; i < cend->data.Shape().front(); ++i) {
+      end.push_back(end_val[i]);
+    }
+    return Array<te::Tensor>{
+      topi::strided_slice(inputs[0], begin, end, strides)
+    };
+  } else {
+    te::Tensor data = inputs[0];
+    te::Tensor begin = inputs[1];
+    te::Tensor end = inputs[2];
+    te::Tensor strides = inputs[3];
+    // Dynamic computation
+    return Array<te::Tensor>{
+      DynamicStridedSlice(data, begin, end, strides)
+    };
+  }
 }
 
-TVM_REGISTER_GLOBAL("relay.op._make.strided_slice").set_body_typed(MakeStridedSlice);
+// Positional relay function to create StridedSlice operator used by frontend FFI.
+Expr MakeStridedSlice(Expr data,
+                      Expr begin,
+                      Expr end,
+                      Expr strides) {
+  auto attrs = make_object<StridedSliceAttrs>();
+  attrs->begin = begin;
+  attrs->end = end;
+  attrs->strides = strides;
+  static const Op& op = Op::Get("strided_slice");
+  return Call(op, {data, begin, end, strides}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.strided_slice")
+.set_body_typed(MakeStridedSlice);
+
 
 RELAY_REGISTER_OP("strided_slice")
     .describe(R"code(Strided slice of an array.
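The dynamic branch above leaves every output extent symbolic and reads element idx of the output from input[idx * strides + begin]; the extents are pinned down later by the hybrid shape function registered in _transform.py, which computes ceil((end[i] - begin[i]) / strides[i]) per axis. A NumPy reference of that semantics (an illustrative sketch, not part of the patch; strided_slice_ref is a hypothetical helper mirroring topi.testing.strided_slice_python for positive strides and in-bounds begin/end, the only case the dynamic path handles):

    import numpy as np

    def strided_slice_ref(data, begin, end, strides):
        # Per-axis extent matches _strided_slice_shape_func:
        # ceil((end[i] - begin[i]) / strides[i]).
        oshape = [-(-(e - b) // s) for b, e, s in zip(begin, end, strides)]
        out = np.empty(oshape, dtype=data.dtype)
        for idx in np.ndindex(*oshape):
            # Element mapping matches DynamicStridedSlice:
            # read the input at idx * stride + begin along every axis.
            out[idx] = data[tuple(i * s + b for i, b, s in zip(idx, begin, strides))]
        return out

    # e.g. strided_slice_ref(np.arange(24).reshape(2, 3, 4), [0, 1, 0], [2, 3, 4], [1, 1, 2])
    # returns a (2, 2, 2) block, the same shape the shape func infers.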
@@ -1867,14 +1950,19 @@ Examples::
     [[ 5., 6.],
      [ 7., 8.]]]
 )code" TVM_ADD_FILELINE)
-    .set_num_inputs(1)
-    .add_argument("data", "Tensor", "The input tensor.")
-    .set_support_level(4)
-    .set_attrs_type<StridedSliceAttrs>()
-    .add_type_rel("StridedSlice", StridedSliceRel)
-    .set_attr<FTVMCompute>("FTVMCompute", StridedSliceCompute)
-    .set_attr<TOpPattern>("TOpPattern", kInjective)
-    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", StridedSliceInferCorrectLayout);
+.set_num_inputs(4)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("begin", "Tensor", "The indices to begin with in the slicing.")
+.add_argument("end", "Tensor", "Indices indicating end of the slice.")
+.add_argument("strides", "Tensor", "The stride values.")
+.set_support_level(4)
+.set_attrs_type<StridedSliceAttrs>()
+.add_type_rel("StridedSlice", StridedSliceRel)
+.set_attr<FTVMCompute>("FTVMCompute", StridedSliceCompute)
+// TODO(@icemelon, @yongwww): Change to kOpaque because FuseOps doesn't consider dynamic shape
+.set_attr<TOpPattern>("TOpPattern", kOpaque)
+.set_attr<AnyCodegenStrategy>("AnyCodegenStrategy", kVariableDimensions)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", StridedSliceInferCorrectLayout);
 
 // strided_set
 bool StridedSetRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc
index b1aaaf01ae9c..bdd4f664165b 100644
--- a/src/relay/op/vision/nms.cc
+++ b/src/relay/op/vision/nms.cc
@@ -82,19 +82,35 @@ bool NMSRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
 
   // assign output type
   if (param->return_indices) {
+    std::vector<Type> fields;
+    // dynamic happens for return_indices in TensorFlow & ONNX
     std::vector<IndexExpr> oshape({dshape[0], dshape[1]});
-    reporter->Assign(types[2], TensorType(oshape, DataType::Int(32)));
+    fields.push_back(TensorType(oshape, DataType::Int(32)));
+    std::vector<IndexExpr> countshape({dshape[0], 1});
+    fields.push_back(TensorType(countshape, DataType::Int(32)));
+    reporter->Assign(types[2], TupleType(Array<Type>(fields)));
   } else {
     reporter->Assign(types[2], TensorType(dshape, data->dtype));
   }
   return true;
 }
 
-Expr MakeNMS(Expr data, Expr valid_count, int max_output_size, double iou_threshold,
-             bool force_suppress, int top_k, int coord_start, int score_index, int id_index,
-             bool return_indices, bool invalid_to_bottom) {
+
+Expr MakeNMS(Expr data,
+             Expr valid_count,
+             int max_output_size,
+             double score_threshold,
+             double iou_threshold,
+             bool force_suppress,
+             int top_k,
+             int coord_start,
+             int score_index,
+             int id_index,
+             bool return_indices,
+             bool invalid_to_bottom) {
   auto attrs = make_object<NonMaximumSuppressionAttrs>();
   attrs->max_output_size = max_output_size;
+  attrs->score_threshold = score_threshold;
   attrs->iou_threshold = iou_threshold;
   attrs->force_suppress = force_suppress;
   attrs->top_k = top_k;
@@ -107,12 +123,14 @@ Expr MakeNMS(Expr data, Expr valid_count, int max_output_size, double iou_thresh
   return Call(op, {data, valid_count}, Attrs(attrs), {});
 }
 
-TVM_REGISTER_GLOBAL("relay.op.vision._make.non_max_suppression").set_body_typed(MakeNMS);
+TVM_REGISTER_GLOBAL("relay.op.vision._make.non_max_suppression")
+.set_body_typed(MakeNMS);
 
 RELAY_REGISTER_OP("vision.non_max_suppression")
-    .describe(R"doc(Non-maximum suppression. The input boxes should
-be in the format of [class_id, score, left, top, right, bottom].
-Set id_index to be -1 to ignore class_id axis.
+.describe(R"doc(Non-maximum suppression. The input boxes should
+be in the format of [class_id, score, left, top, right, bottom]
+or [score, left, top, right, bottom]. Set id_index to be -1 to
+ignore class_id axis.
)doc" TVM_ADD_FILELINE) .set_num_inputs(2) .add_argument("data", "Tensor", "Input data.") diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 1990414c3aa4..e4b2e1ea980b 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -176,7 +176,24 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { begin.push_back(index); index += channels; end.push_back(index); - auto slice = MakeStridedSlice(data, std::move(begin), std::move(end), Array{}); + + + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + auto begin_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); + auto end_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); + auto strides_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); + int64_t* begin_data = static_cast(begin_ndarray->data); + int64_t* end_data = static_cast(end_ndarray->data); + + for (size_t i = 0; i < begin.size(); ++i){ + begin_data[i] = begin[i]; + end_data[i] = end[i]; + } + + auto slice = MakeStridedSlice(data, ConstantNode::make(begin_ndarray), + ConstantNode::make(end_ndarray), ConstantNode::make(strides_ndarray)); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index 06b1e8290fe0..8964959bfcfd 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -673,7 +673,7 @@ Expr MakeConcatenate(Expr data, int axis); Expr MakeRepeat(Expr data, int repeats, int axis); -Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides); +Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides); Expr MakeStack(Expr data, int axis); diff --git a/tests/python/frontend/tensorflow/test_control_flow.py b/tests/python/frontend/tensorflow/test_control_flow.py index 9777a8dc4462..95d5b797430c 100644 --- a/tests/python/frontend/tensorflow/test_control_flow.py +++ b/tests/python/frontend/tensorflow/test_control_flow.py @@ -53,7 +53,7 @@ def b(i): return tf.add(i, 1) r = tf.while_loop(c, b, [i]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -70,7 +70,7 @@ def b(i): return tf.add(i, 1) r = tf.while_loop(c, b, [i]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -89,7 +89,7 @@ def b(i, j): return [tf.add(i, 1), j] i1, i2 = tf.while_loop(c, b, loop_vars=[i0, j0]) i1 += tf.constant(1337) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(i1) check_equal(graph, tf_out) @@ -107,7 +107,7 @@ def c(i, j, k): return i < 10 def b(i, j, k): return [i+1, j * k, k + i] r = tf.while_loop(c, b, loop_vars=[i0, j0, k0]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -127,7 +127,7 @@ def c(i, j, k): return \ def b(i, j, k): return [i+j, j+k, k+1] r = tf.while_loop(c, b, loop_vars=[i, j, k]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -147,7 +147,7 @@ def condition(x): return tf.reduce_sum(x) < 100 x = tf.constant(0, shape=[2, 2]) r = tf.while_loop(condition, body, [x]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -170,7 +170,7 @@ def condition(x): x = tf.constant(3) r = 
tf.while_loop(condition, body, loop_vars=[x]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -189,7 +189,7 @@ def f2(): return tf.add(4, 23) r = tf.cond(tf.less(i, j), f1, f2) - with tf.Session(graph=graph) as sess: + with tf.compat.v1.Session(graph=graph) as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -204,7 +204,7 @@ def test_multiple_cond_vars(): r = tf.cond(tf.less(tf.add(x1, x2), 10), lambda: tf.add(10, 2), lambda: tf.square(5)) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -224,7 +224,7 @@ def fn2(x, y): k = tf.constant(3) r = tf.cond(tf.less(i, j), lambda: fn1(i, k), lambda: fn2(j, k)) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r, feed_dict={i: 1, j: 2, k: 3}) check_equal(graph, tf_out) @@ -252,7 +252,7 @@ def fn2(a, b): pred = tf.less(x, y) r = tf.cond(pred, lambda: fn1(x, y), lambda: fn2(y, z)) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r, feed_dict={x: 1, y: 2, z: 3, pred: True}) check_equal(graph, tf_out) @@ -279,7 +279,7 @@ def fn2(a, b): pred = tf.less(x, y) r = tf.cond(pred, lambda: fn1(x, y), lambda: fn2(y, z)) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r, feed_dict={x: 1, y: 2, z: 3, pred: True}) check_equal(graph, tf_out) @@ -300,7 +300,7 @@ def condition(x): return tf.less(x, 100) r = tf.while_loop(condition, body, loop_vars=[x]) - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) diff --git a/tests/python/frontend/tensorflow/test_debugging.py b/tests/python/frontend/tensorflow/test_debugging.py index 01ad6a256f88..8dac612b4879 100644 --- a/tests/python/frontend/tensorflow/test_debugging.py +++ b/tests/python/frontend/tensorflow/test_debugging.py @@ -17,6 +17,7 @@ """Unit tests for converting TensorFlow debugging ops to Relay.""" try: import tensorflow.compat.v1 as tf + tf.disable_v2_behavior() except ImportError: import tensorflow as tf @@ -24,6 +25,7 @@ from tvm import relay from tvm.relay.frontend.tensorflow import from_tensorflow + def run_relay(graph, shape_dict=None, *vars): mod, params = from_tensorflow( graph.as_graph_def(add_shapes=True), @@ -31,6 +33,7 @@ def run_relay(graph, shape_dict=None, *vars): ex = relay.create_executor('debug', mod=mod) return ex.evaluate()(*vars) + def test_assert_true(): g = tf.Graph() shape = (1, 2) @@ -51,7 +54,8 @@ def test_assert_true(): # do that, it's happening in Relay, and that optimization shouldn't # affect the arity of the main function. We should have to pass in # x_value here. - np.testing.assert_allclose(0, run_relay(g, {'input':shape}).asnumpy()) + np.testing.assert_allclose(0, run_relay(g, {'input': shape}).asnumpy()) + def test_assert_true_var_capture(): g = tf.Graph() @@ -71,7 +75,8 @@ def test_assert_true_var_capture(): # the graph as a boolean, which is not correct - as you can see above, # TF believes that the value of this graph is None. np.testing.assert_allclose(True, - run_relay(g, None, x_value).asnumpy()) + run_relay(g, None, x_value).asnumpy()) + def test_assert_false(): g = tf.Graph() @@ -91,9 +96,8 @@ def test_assert_false(): # argument is false. 
np.testing.assert_allclose(0, run_relay(g).asnumpy()) - + if __name__ == "__main__": test_assert_true() test_assert_true_var_capture() test_assert_false() - diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index c6a285c93d6a..76d2fe13aa49 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -1886,6 +1886,30 @@ def test_forward_crop_and_resize(): extrapolation_value=0.2, method='nearest') +####################################################################### +# Non Max Suppression +# ------------------- +def _test_forward_nms_v3(bx_shape, score_shape, iou_threshold, score_threshold, out_size, dtype="float32"): + boxes = np.random.uniform(0, 10, size=bx_shape).astype(dtype) + scores = np.random.uniform(size=score_shape).astype(dtype) + tf.reset_default_graph() + in_data_1 = tf.placeholder(dtype, boxes.shape, name="in_data_1") + in_data_2 = tf.placeholder(dtype, scores.shape, name="in_data_2") + tf.image.non_max_suppression(boxes=in_data_1, scores=in_data_2, + max_output_size=out_size, iou_threshold=iou_threshold, + score_threshold=score_threshold, name="nms") + compare_tf_with_tvm([boxes, scores], ['in_data_1:0', 'in_data_2:0'], + 'nms/NonMaxSuppressionV3:0', mode='vm') + compare_tf_with_tvm([boxes, scores], ['in_data_1:0', 'in_data_2:0'], + 'nms/NonMaxSuppressionV3:0', mode='debug') + +def test_forward_nms_v3(): + """ NonMaxSuppressionV3 """ + _test_forward_nms_v3((5, 4), (5,), 0.7, 0.5, 5) + _test_forward_nms_v3((20, 4), (20,), 0.5, 0.6, 10) + _test_forward_nms_v3((1000, 4), (1000,), 0.3, 0.7, 1000) + + ####################################################################### # LSTM # ---- @@ -3231,6 +3255,7 @@ def test_forward_isfinite(): test_forward_truncatemod() test_forward_one_hot() test_forward_atan2() + test_forward_nms_v3() # Activations test_forward_sigmoid() @@ -3322,4 +3347,4 @@ def test_forward_isfinite(): test_read_variable_op() # Sharing params case using Mean ops - test_sharing_node() + test_sharing_node() \ No newline at end of file diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 7a8437aaedd3..eca5fb7d4b74 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -169,7 +169,7 @@ def compare_tflite_with_tvm(in_data, in_name, input_tensors, for i in range(len(in_name)): in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: if init_global_variables: sess.run(variables.global_variables_initializer()) # convert to tflite model diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 504c20a7f21f..e6a462bd737e 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -22,6 +22,7 @@ from tvm import relay from tvm.relay.loops import while_loop from tvm.relay.testing import run_infer_type as infer_type +import topi.testing def int32(val): return relay.const(val, 'int32') @@ -642,6 +643,35 @@ def test_arange_with_dynamic_shape(): result = ex.evaluate()(data) tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) +def verify_any_strided_slice(data_shape, begin_shape, end_shape, + strides_shape, data_np_shape): + mod = relay.Module() + data = relay.var('data', shape=data_shape, dtype='float32') + begin = relay.var('begin', shape=begin_shape, dtype="int32") + end = 
relay.var('end', shape=end_shape, dtype="int32") + strides = relay.var('strides', shape=strides_shape, dtype="int32") + y = relay.strided_slice(data, begin, end, strides) + mod["main"] = relay.Function([data, begin, end, strides], y) + + # Generate random numpy input data + data_np = np.random.uniform(size=data_np_shape).astype('float32') + begin_np = np.random.randint(2, size=begin_shape, dtype="int32") + end_np = np.random.randint(5, 15, size=end_shape, dtype="int32") + strides_np = np.random.randint(1, 3, size=strides_shape, dtype="int32") + + ref_res = topi.testing.strided_slice_python(data_np, begin_np, end_np, strides_np) + + for kind in ["debug", "vm"]: + ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") + result = ex.evaluate()(data_np, begin_np, end_np, strides_np) + tvm.testing.assert_allclose(result.asnumpy(), ref_res) + +def test_any_strided_slice(): + verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) + verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) + verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) + + def test_recursive_concat(): """ fn @concat_loop(%i: int32, %st: (any, 1)) -> (any, 1) { @@ -796,7 +826,9 @@ def test_mixed_input_type(): test_any_softmax() test_any_topk() test_fused_ops() + test_any_argwhere() test_arange_with_dynamic_shape() + test_any_strided_slice() test_recursive_concat() test_recursive_concat_with_wrong_annotation() test_tuple_get_item() diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 947a4bfd0b3b..358c8f18ea34 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -297,15 +297,44 @@ def test_mean_var_std(): def test_strided_slice(): def verify(dshape, begin, end, strides, output, test_ref=True): + dtype = "int32" x = relay.var("x", relay.TensorType(dshape, "float32")) - z = relay.strided_slice(x, begin=begin, end=end, strides=strides) + ndim = len(dshape) + begin = begin if begin else [0] * ndim + end = end if end else list(dshape) + strides = strides if strides else [1] * ndim + for i in range(ndim): + if len(begin) <= i: + begin.append(0) + if len(end) <= i: + end.append(dshape[i]) + if len(strides) <= i: + strides.append(1) + if not begin[i]: + begin[i] = 0 + if not end[i]: + end[i] = dshape[i] + if not strides[i]: + strides[i] = 1 + + begin_expr = relay.const(begin, dtype=dtype) + end_expr = relay.const(end, dtype=dtype) + strides_expr = relay.const(strides, dtype=dtype) + + z = relay.strided_slice(x, + begin=begin_expr, + end=end_expr, + strides=strides_expr) func = relay.Function([x], z) + func = run_infer_type(func) text = func.astext() assert "begin=" in text assert "end=" in text + if output: assert func.body.checked_type == relay.ty.TensorType(output, "float32") + if not test_ref: return x_data = np.random.uniform(size=dshape).astype("float32") @@ -316,18 +345,18 @@ def verify(dshape, begin, end, strides, output, test_ref=True): op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") - verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False) + verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], (1, 120, 120, 3), dtype="int64") + verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16") verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], 
(1, 3, 3)) - verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) + verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2)) - verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3)) verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3)) - + verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) + verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): @@ -379,3 +408,4 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True): test_where() test_reduce_functions() test_mean_var_std() + diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c3067523fb03..b73428b0a8f5 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -284,18 +284,22 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32")) - z = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ - iou_threshold = iou_threshold, force_suppress = force_suppress, \ - top_k = top_k, return_indices=False) - z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size = -1, \ - iou_threshold = iou_threshold, force_suppress = force_suppress, \ - top_k = top_k) + z = relay.vision.non_max_suppression(x0, x1, max_output_size=-1, \ + iou_threshold=iou_threshold, force_suppress=force_suppress, \ + top_k=top_k, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size=-1, score_threshold=0.5, \ + iou_threshold=iou_threshold, force_suppress=force_suppress, \ + top_k=top_k) + if isinstance(z_indices, relay.expr.TupleWrapper): + z_indices = z_indices.astuple() assert "iou_threshold" in z.astext() assert "iou_threshold" in z_indices.astext() zz = run_infer_type(z) zz_indices = run_infer_type(z_indices) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") - assert zz_indices.checked_type == relay.ty.TensorType((dshape[0], dshape[1]), "int32") + assert zz_indices.checked_type == relay.ty.TupleType( + [relay.ty.TensorType((dshape[0], dshape[1]), "int32"), + relay.ty.TensorType((dshape[0], 1), "int32")]) if check_type_only: return @@ -307,14 +311,16 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data) - op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - tvm.testing.assert_allclose(op_indices_res1.asnumpy(), ref_indices_res, rtol=1e-5) + if top_k == -1: + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) + tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data) - op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - tvm.testing.assert_allclose(op_indices_res2.asnumpy(), ref_indices_res, rtol=1e-5) + if top_k 
== -1: + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) + tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], diff --git a/topi/python/topi/sort.py b/topi/python/topi/sort.py index e492d683a09d..f79eb52e9266 100644 --- a/topi/python/topi/sort.py +++ b/topi/python/topi/sort.py @@ -31,10 +31,10 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): The input tensor. valid_count : tvm.te.Tensor, optional - 1-D tensor for valid number of boxes only for ssd. + 1-D tensor for valid number of boxes. axis : int, optional - Axis along which to sort the input tensor. + Axis along which to sort the input tensor. By default the flattened array is used. is_ascend : boolean, optional diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 28598dedffbd..48e85cdd574a 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -22,6 +22,55 @@ from tvm.te import hybrid from ..sort import argsort + +@hybrid.script +def hybrid_rearrange_idx(data): + """Hybrid routine to rearrange nms output to + move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + NMS output. 2-D tensor with shape + [batch_size, num_anchors]. + + one: tvm.tir.const + Constant one with the same dtype as data. + + Returns + ------- + output : tvm.te.Tensor or numpy NDArray + Transformed NMS output. 2-D tensor with shape + [batch_size, num_anchors]. + + shape : tvm.te.Tensor or numpy NDArray + Shape of Tensor with valid indexes + [Batch_size, num_valid_indices] + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + out_tensor = output_tensor((batch_size, + num_anchors), + data.dtype) + out_shape = output_tensor((batch_size, + 1), + data.dtype) + + for i in range(batch_size): # range instead + valid_idx = 0 + for j in range(num_anchors): + if data[i, j] >= 0: + out_tensor[i, valid_idx] = data[i, j] + valid_idx += 1 + if data[i, j] > num_anchors or data[i, j] < -num_anchors: + out_tensor[i, valid_idx] = 0 + valid_idx += 1 + if j >= valid_idx: + out_tensor[i, j] = -1 + out_shape[i, 0] = valid_idx + return out_tensor, out_shape + + @hybrid.script def hybrid_rearrange_out(data, one): """Hybrid routine to rearrange nms output to @@ -40,7 +89,7 @@ def hybrid_rearrange_out(data, one): ------- output : tvm.te.Tensor or numpy NDArray Transformed NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6]. + [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. """ batch_size = data.shape[0] num_anchors = data.shape[1] @@ -60,6 +109,7 @@ def hybrid_rearrange_out(data, one): if j >= valid_idx: for k in range(elem_length): output[i, j, k] = -one + return output @@ -154,9 +204,8 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): @hybrid.script -def hybrid_nms(data, sorted_index, valid_count, - max_output_size, iou_threshold, force_suppress, - top_k, coord_start, id_index, score_index, zero, one): +def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, + force_suppress, top_k, coord_start, id_index, score_index, zero, one): """Hybrid routing for non-maximum suppression. Parameters @@ -203,7 +252,8 @@ def hybrid_nms(data, sorted_index, valid_count, Returns ------- output : tvm.te.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. 
+ 3-D tensor with shape [batch_size, num_anchors, 6] + or [batch_size, num_anchors, 5]. box_indices: tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors]. @@ -211,7 +261,7 @@ def hybrid_nms(data, sorted_index, valid_count, batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] - box_indices = output_tensor((batch_size, num_anchors), "int32") + box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype) output = output_tensor((batch_size, num_anchors, box_data_length,), data.dtype) @@ -289,8 +339,136 @@ def hybrid_nms(data, sorted_index, valid_count, num_valid_boxes += 1 return output, box_indices +@hybrid.script +def hybrid_dynamic_nms(data, sorted_index, max_output_size, score_threshold, + iou_threshold, score_index, zero, one): + """Hybrid routing for non-maximum suppression. -def non_max_suppression(data, valid_count, max_output_size=-1, + Parameters + ---------- + data: tvm.te.Tensor or numpy NDArray + Bounding boxes with class and score. 3-D tensor with shape + [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. + + sorted_index : tvm.te.Tensor or numpy NDArray + Bounding box indexes sorted by score, with shape + [batch_size, num_anchors]. + + max_output_size : tvm.tir.const + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + score_threshold : tvm.tir.const + Lower limit of score for valid bounding boxes. + + iou_threshold : tvm.tir.const + Overlapping(IoU) threshold to suppress object with smaller score. + + score_index: tvm.tir.const + Index of the scores/confidence of boxes. + + zero: tvm.tir.const + Constant zero with the same dtype as data. + + one: tvm.tir.const + Constant one with the same dtype as data. + + Returns + ------- + box_indices: tvm.te.Tensor + 2-D tensor with shape [batch_size, num_anchors]. 
+ """ + + + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + + # box_indices is the expected value, similar to TF & ONNX + box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype) + output = output_tensor((batch_size, + num_anchors, + box_data_length,), data.dtype) + + for i in range(batch_size): + if iou_threshold > 0: + # Reorder output + for j in parallel(num_anchors): + for k in range(box_data_length): + output[i, j, k] = data[i, sorted_index[i, j], k] + if output[i, j, score_index] > score_threshold: + box_indices[i, j] = sorted_index[i, j] + else: + box_indices[i, j] = -1 + + # Apply nms + box_start_idx = 1 + batch_idx = i + + for j in range(num_anchors): + # index sorted + j_sorted = sorted_index[i, j] + + box_a_idx = j + # l: left, t: top, r: right, b: bottom + a_l = min(output[batch_idx, box_a_idx, box_start_idx], + output[batch_idx, box_a_idx, box_start_idx + 2]) + a_t = min(output[batch_idx, box_a_idx, box_start_idx + 1], + output[batch_idx, box_a_idx, box_start_idx + 3]) + a_r = max(output[batch_idx, box_a_idx, box_start_idx], + output[batch_idx, box_a_idx, box_start_idx + 2]) + a_b = max(output[batch_idx, box_a_idx, box_start_idx + 1], + output[batch_idx, box_a_idx, box_start_idx + 3]) + + for k in parallel(j + 1, num_anchors): + k_sorted = sorted_index[i, k] + box_b_idx = k + # l: left, t: top, r: right, b: bottom + b_l = min(output[batch_idx, box_b_idx, box_start_idx], + output[batch_idx, box_b_idx, box_start_idx + 2]) + b_t = min(output[batch_idx, box_b_idx, box_start_idx + 1], + output[batch_idx, box_b_idx, box_start_idx + 3]) + b_r = max(output[batch_idx, box_b_idx, box_start_idx], + output[batch_idx, box_b_idx, box_start_idx + 2]) + b_b = max(output[batch_idx, box_b_idx, box_start_idx + 1], + output[batch_idx, box_b_idx, box_start_idx + 3]) + + # Overlapping width and height + w = max(zero, min(a_r, b_r) - max(a_l, b_l)) + h = max(zero, min(a_b, b_b) - max(a_t, b_t)) + + # Overlapping area + area = h * w + + # total area of the figure formed by box a and box b except for overlapping area + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + + # get the iou + iou = area / u + + # output[i, k, sorted_index] = iou + + if iou >= score_threshold: + box_indices[i, k] = -1 + + else: + for j in parallel(num_anchors): + box_indices[i, j] = sorted_index[i, j] + + # Only return max_output_size valid boxes + num_valid_boxes = 0 + if max_output_size > 0: + for j in parallel(num_anchors): + if num_valid_boxes == max_output_size: + box_indices[i, j] = -1 + else: + num_valid_boxes += 1 + + return output, box_indices + + +@tvm.target.generic_func +def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0.0, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): @@ -308,6 +486,9 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Max number of output valid boxes for each instance. By default all valid boxes are returned. + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + iou_threshold : optional, float Non-maximum suppression threshold. @@ -334,8 +515,12 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Returns ------- - out : tvm.te.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6]. + out : tvm.te.Tensor or tuple of tvm.te.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6] + or [batch_size, num_anchors, 6]. 
Out is a tuple of tvm.te.Tensor + if return_indices is True, the Tensor in the tuple is 2-D tensor + with shape [batch_size, num_anchors] and shape + [batch_size, num_valid_anchors] respectively. Example -------- @@ -366,17 +551,33 @@ def non_max_suppression(data, valid_count, max_output_size=-1, score_shape = (batch_size, num_anchors) score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis]) sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) - out, box_indices = hybrid_nms(data, sort_tensor, valid_count, - tvm.tir.const(max_output_size, dtype="int32"), - tvm.tir.const(iou_threshold, dtype=data.dtype), - tvm.tir.const(force_suppress, dtype="bool"), - tvm.tir.const(top_k, dtype="int32"), - tvm.tir.const(coord_start, dtype="int32"), - tvm.tir.const(id_index, dtype="int32"), - tvm.tir.const(score_index, dtype="int32"), - zero=tvm.tir.const(0, dtype=data.dtype), - one=tvm.tir.const(1, dtype=data.dtype)) - if not return_indices and invalid_to_bottom: - out = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) - - return box_indices if return_indices else out + + if return_indices: + # return a tuple with two tensor, one is the computed valid indices of boxes, appending -1 as invalid boxes + # the other one is the number of valid boxes + out, box_indices = hybrid_dynamic_nms(data, + sort_tensor, + tvm.tir.const(max_output_size, dtype="int32"), + tvm.tir.const(score_threshold, dtype=data.dtype), + tvm.tir.const(iou_threshold, dtype=data.dtype), + tvm.tir.const(score_index, dtype="int32"), + zero=tvm.tir.const(0, dtype=data.dtype), + one=tvm.tir.const(1, dtype=data.dtype)) + box_indices, out_shape = hybrid_rearrange_idx(box_indices) + return [box_indices, out_shape] + else: + out, box_indices = hybrid_nms(data, + sort_tensor, + valid_count, + tvm.tir.const(max_output_size, dtype="int32"), + tvm.tir.const(iou_threshold, dtype=data.dtype), + tvm.tir.const(force_suppress, dtype="bool"), + tvm.tir.const(top_k, dtype="int32"), + tvm.tir.const(coord_start, dtype="int32"), + tvm.tir.const(id_index, dtype="int32"), + tvm.tir.const(score_index, dtype="int32"), + zero=tvm.tir.const(0, dtype=data.dtype), + one=tvm.tir.const(1, dtype=data.dtype)) + if invalid_to_bottom: + out = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) + return out diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index ba0cf5440c9a..e1ddc7bab9b0 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -304,7 +304,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, + out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, score_threshold=0, iou_threshold=nms_threshold, force_suppress=force_suppress, top_k=nms_topk, return_indices=False) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 3ccb44d0f47c..77cd2f1d7e65 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -120,13 +120,14 @@ def test_get_valid_counts(): verify_get_valid_counts((16, 500, 5), 0.95, -1, 1) -def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, iou_threshold, - force_suppress, top_k, coord_start, score_index, id_index): +def 
verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, + iou_threshold, force_suppress, top_k, coord_start, score_index, id_index): dshape = np_data.shape batch, num_anchors, _ = dshape indices_dshape = (batch, num_anchors) data = te.placeholder(dshape, name="data") valid_count = te.placeholder((batch,), dtype="int32", name="valid_count") + indices = te.placeholder((batch, num_anchors), dtype="int32", name="indices") def check_device(device): ctx = tvm.context(device, 0) @@ -136,10 +137,10 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = topi.testing.dispatch(device, _nms_implement) - out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + out = fcompute(data, valid_count, indices, -1, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index, return_indices=False) - indices_out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + indices_out = fcompute(data, valid_count, indices, -1, iou_threshold, force_suppress, top_k, coord_start=coord_start, score_index=score_index, id_index=id_index) s = fschedule(out) indices_s = fschedule(indices_out) @@ -153,7 +154,7 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - f = tvm.build(indices_s, [data, valid_count, indices_out], device) + f = tvm.build(indices_s, [data, valid_count, indices_out[0]], device) f(tvm_data, tvm_valid_count, tvm_indices_out) tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) @@ -171,7 +172,7 @@ def test_non_max_suppression(): [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.7, True, 2, 2, 1, 0) + verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.6, 0.7, True, 2, 2, 1, 0) np_data = np.array([[[0.8, 1, 20, 25, 45], [0.7, 30, 60, 50, 80], [0.4, 4, 21, 19, 40], [0.9, 35, 61, 52, 79], @@ -181,7 +182,7 @@ def test_non_max_suppression(): [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.7, False, 2, 1, 0, -1) + verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.6, 0.7, False, 2, 1, 0, -1) @@ -459,9 +460,9 @@ def test_proposal(): if __name__ == "__main__": test_get_valid_counts() - test_non_max_suppression() test_multibox_prior() test_multibox_detection() test_roi_align() test_roi_pool() test_proposal() + test_non_max_suppression() diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 0ebd733ef9aa..1b97c442dbca 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -101,7 +101,7 @@ # Call the utility to import the graph definition into default graph. graph_def = tf_testing.ProcessGraphDefParam(graph_def) # Add shapes to the graph. - with tf_compat_v1.Session() as sess: + with tf.compat.v1.Session() as sess: graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax') ###################################################################### @@ -218,7 +218,7 @@ def run_inference_on_image(image): # Creates graph from saved GraphDef. 
create_graph() - with tf_compat_v1.Session() as sess: + with tf.compat.v1.Session() as sess: softmax_tensor = sess.graph.get_tensor_by_name('softmax:0') predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': image_data}) From 2b6b19a66ac5173b796ca69be78fbb025f015f8e Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 15 Nov 2019 12:24:29 -0800 Subject: [PATCH 02/22] Incorporate comments --- include/tvm/relay/attrs/vision.h | 1 - python/tvm/relay/frontend/common.py | 2 +- python/tvm/relay/frontend/keras.py | 4 +- python/tvm/relay/frontend/mxnet.py | 25 +- python/tvm/relay/frontend/onnx.py | 13 +- python/tvm/relay/frontend/tensorflow.py | 38 +- python/tvm/relay/op/_tensor_grad.py | 6 +- python/tvm/relay/op/transform.py | 2 +- python/tvm/relay/op/vision/nms.py | 15 +- src/relay/op/tensor/transform.cc | 116 ++++-- src/relay/op/vision/nms.cc | 24 +- .../transforms/combine_parallel_conv2d.cc | 17 +- .../frontend/tensorflow/test_forward.py | 2 + tests/python/relay/test_op_level4.py | 36 +- tests/python/relay/test_op_level5.py | 40 +- topi/python/topi/cuda/ssd/multibox.py | 2 +- topi/python/topi/vision/nms.py | 375 +++++++----------- topi/python/topi/vision/ssd/multibox.py | 2 +- topi/tests/python/test_topi_vision.py | 33 +- 19 files changed, 366 insertions(+), 387 deletions(-) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 0a0ca9fe93d6..3edd23f34494 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -89,7 +89,6 @@ struct GetValidCountsAttrs : public tvm::AttrsNode { /*! \brief Attributes used in non_maximum_suppression operator */ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode { int max_output_size; - double score_threshold; double iou_threshold; bool force_suppress; int top_k; diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 8dc1a70f5b68..e86890f3639a 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -571,4 +571,4 @@ def __init__(self, new_name): def __call__(self, inputs, attrs, *args): if 'tvm_custom' in attrs: attrs.pop('tvm_custom') - return get_relay_op(self._new_name)(*inputs, **attrs) \ No newline at end of file + return get_relay_op(self._new_name)(*inputs, **attrs) diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 986995826724..ef76eb69311d 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -611,8 +611,8 @@ def _convert_cropping(inexpr, keras_layer, _): raise tvm.error.OpNotImplemented( 'Operator {} is not supported for frontend Keras.'.format(crop_type)) int32_max = np.iinfo(np.int32).max - return _op.strided_slice(inexpr, begin=[0, 0, crop_t, crop_l], \ - end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r]) + return _op.strided_slice(inexpr, begin=_expr.const([0, 0, crop_t, crop_l]), \ + end=_expr.const([int32_max, int32_max, in_h-crop_b, in_w-crop_r])) def _convert_batchnorm(inexpr, keras_layer, etab): diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index c75612dd4916..2853320212e4 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -411,16 +411,22 @@ def _mx_slice(inputs, attrs): begin = list(attrs.get_int_tuple('begin', None)) end = list(attrs.get_int_tuple('end', None)) stride = attrs.get_int_tuple('step', None) + input_shape = _infer_type(inputs[0]).checked_type.shape if begin is None: raise tvm.error.OpAttributeRequired( 'Attribute "begin" 
not found in operator Slice.') if end is None: raise tvm.error.OpAttributeRequired( 'Attribute "end" not found in operator Slice.') - begin = tuple(x if x is not None else 0 for x in begin) - new_attrs = {'begin': begin, 'end': end} + begin = (x if x is not None else 0 for x in begin) + for i, ed in enumerate(end): + if ed is None: + end[i] = input_shape[i] + new_attrs = {'begin': _expr.const(list(begin), dtype="int32"), + 'end': _expr.const(list(end), dtype="int32")} if stride is not None: - new_attrs['strides'] = stride + stride = (x if x is not None else 1 for x in stride) + new_attrs['strides'] = _expr.const(list(stride), dtype="int32") return _op.strided_slice(inputs[0], **new_attrs) @@ -460,7 +466,9 @@ def _mx_slice_axis(inputs, attrs): else: begin.append(ax_beg) end.append(ax_end) - return _op.strided_slice(inputs[0], begin, end) + return _op.strided_slice(inputs[0], + _expr.const(begin, dtype="int32"), + _expr.const(end, dtype="int32")) def _mx_crop_like(inputs, attrs): @@ -480,9 +488,9 @@ def _mx_crop_like(inputs, attrs): return _op.slice_like(*inputs, **new_attrs) expr = _infer_type(inputs[1]) like_shape = expr.checked_type.shape - new_attrs['begin'] = [0, 0, offset[0], offset[1]] - new_attrs['end'] = [like_shape[0], like_shape[1], offset[0]+like_shape[2], - offset[1]+like_shape[3]] + new_attrs['begin'] = _expr.const([0, 0, offset[0], offset[1]], dtype="int32") + new_attrs['end'] = _expr.const([like_shape[0], like_shape[1], offset[0]+like_shape[2], + offset[1]+like_shape[3]], dtype="int32") return _op.strided_slice(inputs[0], **new_attrs) @@ -656,7 +664,7 @@ def _mx_multibox_detection(inputs, attrs): ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _op.vision.non_max_suppression(ret[0], ret[1], **new_attrs1) + return _op.vision.non_max_suppression(ret[0], ret[1], ret[1], **new_attrs1) def _mx_batch_dot(inputs, attrs): @@ -820,6 +828,7 @@ def _mx_box_nms(inputs, attrs): id_index=id_index, score_index=score_index) nms_out = _op.vision.non_max_suppression(ret[1], ret[0], + ret[2], iou_threshold=iou_thresh, force_suppress=force_suppress, top_k=top_k, diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 58ec4ee56a93..6643a888f10b 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -945,11 +945,12 @@ def _impl_v1(cls, inputs, attr, params): attr['ends'] = new_ends except KeyError: pass + begin = list(attr['starts']) + end = list(attr['ends']) - return AttrCvt('strided_slice', - transforms={'starts': 'begin', - 'ends': 'end'}, - ignores=['axes'])(inputs, attr) + return _op.strided_slice(inputs[0], + begin=_expr.const(begin, dtype="int32"), + end=_expr.const(end, dtype="int32")) @classmethod def _impl_v10(cls, inputs, attr, params): @@ -965,7 +966,9 @@ def _impl_v10(cls, inputs, attr, params): starts, ends, axes) starts = new_starts ends = new_ends - return _op.strided_slice(inputs[0], begin=starts, end=ends) + return _op.strided_slice(inputs[0], + begin=_expr.const(starts, dtype="int32"), + end=_expr.const(ends, dtype="int32")) class Gather(OnnxOpConverter): diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 5926959349c8..59740d204744 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -97,7 +97,7 @@ def _need_prelude_for_shape_inference(op): return "TensorArray" in op def _need_module_for_shape_inference(op): - return op in ['StridedSlice, NonMaxSuppressionV3'] + 
return op in ['StridedSlice', 'NonMaxSuppressionV3'] def _rsqrt(): def _impl(inputs, attr, params, mod): @@ -624,41 +624,38 @@ def _impl(inputs, attr, params): # score_threshold was introduced from V3 score_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0] if len(inputs) > 4 else None + # Generate data with shape (1, num_anchors, 5) scores = AttrCvt(op_name="expand_dims", extras={'axis': -1, 'num_newaxis': 1})([inputs[1]], attr) - data = get_relay_op('concatenate')([scores, inputs[0]], -1) - # expand to [class_id, prob, box] - # data = _get_relay_op('concatenate')([scores, data], -1) - # expand to [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5] - data = get_relay_op('expand_dims')(data, 0, 1) - # Don't need to call get_valid_counts for TensorFlow and ONNX - # ct, data = _get_relay_op('get_valid_counts')(data, score_threshold=score_threshold, - # id_index=-1, score_index=0) - # get the number of anchors - data_shape = attr['_input_shapes'][inputs[1]] - valid_cnt = _expr.const(data_shape) + # get_valid_counts is called here to improve inference performance + ct, data, indices = get_relay_op('get_valid_counts')(data, + score_threshold=score_threshold, + id_index=-1, + score_index=0) # TensorFlow NMS doesn't have parameter top_k top_k = -1 - # score_index is 0 since TF doesn't have class id for nms input + # TF doesn't have class id for nms input score_index = 0 nms_ret = get_relay_op('non_max_suppression')(data=data, - valid_count=valid_cnt, + valid_count=ct, + indices=indices, max_output_size=max_output_size, - score_threshold=score_threshold, iou_threshold=iou_threshold, - force_suppress=False, + force_suppress=True, top_k=top_k, coord_start=1, score_index=score_index, id_index=-1, return_indices=True, invalid_to_bottom=False) - + # squeeze the outputs, since TF NMS is not batched end = get_relay_op("squeeze")(nms_ret[1], axis=[1]) data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0]) + + # slice to get the dynamic result ret = get_relay_op("strided_slice")(data_slice, _expr.const([0]), end, _expr.const([1])) return ret return _impl @@ -1515,8 +1512,11 @@ def _transform_mask(stride_dim, ellipsis_mask): fshape_indices = None if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask: begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask) - out = _op.strided_slice(inputs[0], begin=begin, end=end, strides=stride) - out_shape = _infer_shape(out, mod) + out = _op.strided_slice(inputs[0], + begin=_expr.const(begin), + end=_expr.const(end), + strides=_expr.const(stride)) + out_shape = _infer_shape(out, mod=mod) if not fshape_indices: fshape_indices = range(len(out_shape)) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 8ba10207020e..c8c5b76e5427 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -390,8 +390,10 @@ def conv2d_grad(orig, grad): assert padded_weight_grad_h >= filter_h assert padded_weight_grad_w >= filter_w if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w: - backward_weight = strided_slice(backward_weight, begin=[0, 0, 0, 0], - end=[None, None, filter_h, filter_w]) + backward_weight = strided_slice(backward_weight, + begin=const([0, 0, 0, 0], dtype="int64"), + end=const([out_channel, in_channel // attrs.groups, + filter_h, filter_w], dtype="int64")) return [backward_data, backward_weight] diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index c8c540fa861b..44b8c1c03f9d 100644 --- 
a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -634,7 +634,7 @@ def strided_slice(data, begin, end, strides=None): ret : relay.Expr The computed result. """ - strides = strides or const(1) + strides = strides or const([1], dtype="int32") return _make.strided_slice(data, begin, end, strides) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index a4e98a732715..f2657f47c6b2 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -47,16 +47,19 @@ def get_valid_counts(data, out_tensor : relay.Expr Rearranged data tensor. + + out_indices: relay.Expr + Indices in input data """ return expr.TupleWrapper( _make.get_valid_counts(data, score_threshold, - id_index, score_index), 2) + id_index, score_index), 3) def non_max_suppression(data, valid_count, + indices, max_output_size=-1, - score_threshold=0.0, iou_threshold=0.5, force_suppress=False, top_k=-1, @@ -79,13 +82,13 @@ def non_max_suppression(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. + indices: relay.Expr + 2-D tensor with shape [batch_size, num_anchors] + max_output_size : int, optional Max number of output valid boxes for each instance. By default all valid boxes are returned. - score_threshold : float, optional - Lower limit of score for valid bounding boxes. - iou_threshold : float, optional Non-maximum suppression threshold. @@ -120,8 +123,8 @@ def non_max_suppression(data, """ out = _make.non_max_suppression(data, valid_count, + indices, max_output_size, - score_threshold, iou_threshold, force_suppress, top_k, diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 5d471f99a47f..4e1b05e04620 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1676,6 +1676,66 @@ Array GetIntArray(Array arr) { // strided_slice TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); +int64_t* ToVector(const runtime::NDArray& array) { + size_t len = array.Shape().front(); + int64_t* rel_vec = new int64_t[len]; + if (array->dtype.code == kDLInt) { + if (array->dtype.bits == 8) { + int8_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 16) { + int16_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 32) { + int32_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 64) { + int64_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } + } else if (array->dtype.code == kDLUInt) { + if (array->dtype.bits == 8) { + uint8_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 16) { + uint16_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 32) { + uint32_t* init_array = reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } else if (array->dtype.bits == 64) { + uint64_t* init_array = 
reinterpret_cast(array->data); + for (size_t i = 0; i < len; ++i) { + rel_vec[i] = int64_t(init_array[i]); + } + return rel_vec; + } + } + LOG(FATAL) << "Unknown data type: " << tvm::runtime::TVMType2String(array->dtype); + return rel_vec; +} + bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -1686,7 +1746,7 @@ bool StridedSliceRel(const Array& types, const auto* data = types[0].as(); CHECK(data != nullptr); auto dshape = data->shape; - auto num_axis = dshape.size(); + int64_t num_axis = dshape.size(); // calculate output shape std::vector oshape(num_axis); @@ -1695,32 +1755,32 @@ bool StridedSliceRel(const Array& types, (cend = param->end.as()) && (cstrides = param->strides.as())) { std::vector stride_vec; - int32_t* strides_val = reinterpret_cast(cstrides->data->data); - for (size_t i = 0; i < cstrides->data.Shape().front(); ++i){ + int64_t* strides_val = ToVector(cstrides->data); + for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { stride_vec.push_back(strides_val[i]); } - for (size_t i = stride_vec.size(); i < num_axis; ++i) { + for (int64_t i = stride_vec.size(); i < num_axis; ++i) { stride_vec.push_back(1); } const int64_t max_range = std::numeric_limits::max(); std::vector begin_vec; - int32_t* begin_val = reinterpret_cast(cbegin->data->data); - for (size_t i = 0; i < cbegin->data.Shape().front(); ++i){ + int64_t* begin_val = ToVector(cbegin->data); + for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { begin_vec.push_back(begin_val[i]); } - for (size_t i = begin_vec.size(); i < num_axis; ++i) { + for (int64_t i = begin_vec.size(); i < num_axis; ++i) { begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range); } std::vector end_vec; - int32_t* end_val = reinterpret_cast(cend->data->data); - for (size_t i = 0; i < cend->data.Shape().front(); ++i){ + int64_t* end_val = ToVector(cend->data); + for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { end_vec.push_back(end_val[i]); } - for (size_t i = end_vec.size(); i < num_axis; ++i) { + for (int64_t i = end_vec.size(); i < num_axis; ++i) { end_vec.push_back(stride_vec[i] < 0 ? 
0 : max_range); } - for (size_t i = 0; i < num_axis; ++i) { + for (int64_t i = 0; i < num_axis; ++i) { int64_t stride_v = stride_vec[i]; int64_t begin_v = begin_vec[i]; int64_t end_v = end_vec[i]; @@ -1784,9 +1844,9 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } CHECK(old_in_layouts.defined()); - CHECK_EQ(old_in_layouts.size(), 1); + CHECK_GE(old_in_layouts.size(), 1); CHECK(old_in_shapes.defined()); - CHECK_EQ(old_in_shapes.size(), 1); + CHECK_GE(old_in_shapes.size(), 1); auto layout = old_in_layouts[0]; if (layout.defined() && new_in_layouts.defined()) { @@ -1802,17 +1862,16 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, if ((cbegin = params->begin.as()) && (cend = params->end.as()) && (cstrides = params->strides.as())) { - - int32_t* strides_val = reinterpret_cast(cstrides->data->data); - for (size_t i = 0; i < cstrides->data.Shape().front(); ++i){ + int64_t* strides_val = ToVector(cstrides->data); + for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { strides.push_back(strides_val[i]); } - int32_t* begin_val = reinterpret_cast(cbegin->data->data); - for (size_t i = 0; i < cbegin->data.Shape().front(); ++i){ + int64_t* begin_val = ToVector(cbegin->data); + for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { begin.push_back(begin_val[i]); } - int32_t* end_val = reinterpret_cast(cend->data->data); - for (size_t i = 0; i < cend->data.Shape().front(); ++i){ + int64_t* end_val = ToVector(cend->data); + for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { end.push_back(end_val[i]); } } @@ -1859,11 +1918,12 @@ inline Tensor DynamicStridedSlice(const tvm::Tensor& input, const tvm::Tensor& strides, std::string name = "T_strided_slice_dynamic", std::string tag = topi::kInjective) { - size_t src_tensor_dim = static_cast(input->shape.size()); + int64_t src_tensor_dim = input->shape.size(); Array out_shape; - for(size_t i = 0; i < src_tensor_dim; ++i){ + for (int64_t i = 0; i < src_tensor_dim; ++i) { out_shape.push_back(tvm::Var("dim")); } + // TODO(yongwww): move the compute into topi after nnvm is removed return tvm::compute(out_shape, [&](const Array& indices) { Array real_indices; for (int32_t i = 0; i < src_tensor_dim; ++i) { @@ -1882,16 +1942,16 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayend.as()) && (cstrides = param->strides.as())) { Array begin, end, strides; - int32_t* strides_val = reinterpret_cast(cstrides->data->data); - for (size_t i = 0; i < cstrides->data.Shape().front(); ++i){ + int64_t* strides_val = ToVector(cstrides->data); + for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { strides.push_back(strides_val[i]); } - int32_t* begin_val = reinterpret_cast(cbegin->data->data); - for (size_t i = 0; i < cbegin->data.Shape().front(); ++i){ + int64_t* begin_val = ToVector(cbegin->data); + for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { begin.push_back(begin_val[i]); } - int32_t* end_val = reinterpret_cast(cend->data->data); - for (size_t i = 0; i < cend->data.Shape().front(); ++i){ + int64_t* end_val = ToVector(cend->data); + for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { end.push_back(end_val[i]); } return Array{ diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index bdd4f664165b..4002820d0c15 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -37,9 +37,11 @@ bool GetValidCountRel(const Array& types, int num_inputs, const Attrs& att CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; std::vector oshape({data->shape[0]}); + 
std::vector oshape_indices({data->shape[0], data->shape[1]}); std::vector fields; fields.push_back(TensorType(oshape, DataType::Int(32))); fields.push_back(TensorType(data->shape, data->dtype)); + fields.push_back(TensorType(oshape_indices, DataType::Int(32))); // assign output type reporter->Assign(types[1], TupleType(Array(fields))); @@ -71,7 +73,7 @@ TVM_REGISTER_NODE_TYPE(NonMaximumSuppressionAttrs); bool NMSRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - CHECK_EQ(types.size(), 3); + CHECK_EQ(types.size(), 4); const auto* data = types[0].as(); const auto* valid_count = types[1].as(); const NonMaximumSuppressionAttrs* param = attrs.as(); @@ -88,9 +90,9 @@ bool NMSRel(const Array& types, int num_inputs, const Attrs& attrs, fields.push_back(TensorType(oshape, DataType::Int(32))); std::vector countshape({dshape[0], 1}); fields.push_back(TensorType(countshape, DataType::Int(32))); - reporter->Assign(types[2], TupleType(Array(fields))); + reporter->Assign(types[3], TupleType(Array(fields))); } else { - reporter->Assign(types[2], TensorType(dshape, data->dtype)); + reporter->Assign(types[3], TensorType(dshape, data->dtype)); } return true; } @@ -98,8 +100,8 @@ bool NMSRel(const Array& types, int num_inputs, const Attrs& attrs, Expr MakeNMS(Expr data, Expr valid_count, + Expr indices, int max_output_size, - double score_threshold, double iou_threshold, bool force_suppress, int top_k, @@ -110,7 +112,6 @@ Expr MakeNMS(Expr data, bool invalid_to_bottom) { auto attrs = make_object(); attrs->max_output_size = max_output_size; - attrs->score_threshold = score_threshold; attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; attrs->top_k = top_k; @@ -120,7 +121,7 @@ Expr MakeNMS(Expr data, attrs->return_indices = return_indices; attrs->invalid_to_bottom = invalid_to_bottom; static const Op& op = Op::Get("vision.non_max_suppression"); - return Call(op, {data, valid_count}, Attrs(attrs), {}); + return Call(op, {data, valid_count, indices}, Attrs(attrs), {}); } TVM_REGISTER_GLOBAL("relay.op.vision._make.non_max_suppression") @@ -132,11 +133,12 @@ be in the format of [class_id, score, left, top, right, bottom] or [score, left, top, right, bottom]. Set id_index to be -1 to ignore class_id axis. 
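For illustration, a minimal Python sketch of driving the three-input op registered here, with toy shapes assumed (the three get_valid_counts outputs feed NMS directly):

from tvm import relay

# Sketch only: shape, thresholds, and variable names are illustrative.
data = relay.var("data", shape=(1, 5, 6), dtype="float32")
cnt = relay.vision.get_valid_counts(data, score_threshold=0.0,
                                    id_index=0, score_index=1)
# cnt[0]: valid_count, cnt[1]: rearranged data, cnt[2]: original indices
nms = relay.vision.non_max_suppression(cnt[1], cnt[0], cnt[2],
                                       max_output_size=-1,
                                       iou_threshold=0.5,
                                       return_indices=False)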
)doc" TVM_ADD_FILELINE) - .set_num_inputs(2) - .add_argument("data", "Tensor", "Input data.") - .add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") - .set_support_level(5) - .add_type_rel("NMS", NMSRel); +.set_num_inputs(3) +.add_argument("data", "Tensor", "Input data.") +.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") +.add_argument("indices", "Tensor", "Corresponding indices in original input tensor.") +.set_support_level(5) +.add_type_rel("NMS", NMSRel); } // namespace relay } // namespace tvm diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index e4b2e1ea980b..58ac315d3234 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -71,12 +71,15 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { const auto shape_b = tir::BijectiveLayout(Layout(attrs_b->kernel_layout), kOIHW).ForwardShape(tweight_b->shape); - return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) && - eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) && + return eq(attrs_a->strides, attrs_b->strides) && + eq(attrs_a->padding, attrs_b->padding) && + eq(attrs_a->dilation, attrs_b->dilation) && + eq(attrs_a->groups, attrs_b->groups) && eq(attrs_a->data_layout, attrs_b->data_layout) && eq(attrs_a->kernel_layout, attrs_b->kernel_layout) && eq(attrs_a->out_dtype, attrs_b->out_dtype) && - eq(attrs_a->out_layout, attrs_b->out_layout) && eq(shape_a[2], shape_b[2]) && + eq(attrs_a->out_layout, attrs_b->out_layout) && + eq(shape_a[2], shape_b[2]) && eq(shape_a[3], shape_b[3]); } @@ -187,13 +190,15 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { int64_t* begin_data = static_cast(begin_ndarray->data); int64_t* end_data = static_cast(end_ndarray->data); - for (size_t i = 0; i < begin.size(); ++i){ + for (size_t i = 0; i < begin.size(); ++i) { begin_data[i] = begin[i]; end_data[i] = end[i]; } - auto slice = MakeStridedSlice(data, ConstantNode::make(begin_ndarray), - ConstantNode::make(end_ndarray), ConstantNode::make(strides_ndarray)); + auto slice = MakeStridedSlice(data, + ConstantNode::make(begin_ndarray), + ConstantNode::make(end_ndarray), + ConstantNode::make(strides_ndarray)); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 76d2fe13aa49..82c00e2db1d9 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -190,6 +190,7 @@ def name_without_num(name): target=device, out_names=out_name, num_output=len(out_name), opt_level=opt_level, mode=mode, cuda_layout=cuda_layout) + # since the names from tensorflow and relay runs are not exactly same, # first len(tf_output) will be compared for i in range(len(tf_output)): @@ -3320,6 +3321,7 @@ def test_forward_isfinite(): test_forward_space_to_batch_nd() test_forward_batch_to_space_nd() test_forward_dilation() + test_forward_nms_v3() # End to End test_forward_inception_v3() diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 358c8f18ea34..ea5ebcd6c265 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -296,35 +296,23 @@ def test_mean_var_std(): def test_strided_slice(): - def verify(dshape, begin, end, strides, output, test_ref=True): - dtype = "int32" + def verify(dshape, 
begin, end, strides, output, test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) begin = begin if begin else [0] * ndim end = end if end else list(dshape) - strides = strides if strides else [1] * ndim - for i in range(ndim): - if len(begin) <= i: - begin.append(0) - if len(end) <= i: - end.append(dshape[i]) - if len(strides) <= i: - strides.append(1) - if not begin[i]: - begin[i] = 0 - if not end[i]: - end[i] = dshape[i] - if not strides[i]: - strides[i] = 1 - begin_expr = relay.const(begin, dtype=dtype) end_expr = relay.const(end, dtype=dtype) - strides_expr = relay.const(strides, dtype=dtype) - - z = relay.strided_slice(x, - begin=begin_expr, - end=end_expr, - strides=strides_expr) + if strides: + strides_expr = relay.const(strides, dtype=dtype) + z = relay.strided_slice(x, + begin=begin_expr, + end=end_expr, + strides=strides_expr) + else: + z = relay.strided_slice(x, + begin=begin_expr, + end=end_expr) func = relay.Function([x], z) func = run_infer_type(func) @@ -348,8 +336,6 @@ def verify(dshape, begin, end, strides, output, test_ref=True): verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], (1, 120, 120, 3), dtype="int64") verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16") verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) - verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3)) - verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3)) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index b73428b0a8f5..df3091f3021d 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -244,6 +244,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype) np_out1 = np.zeros(shape=(batch_size,)) np_out2 = np.zeros(shape=dshape).astype(dtype) + np_out3 = np.zeros(shape=(batch_size, num_anchor)) for i in range(batch_size): np_out1[i] = 0 inter_idx = 0 @@ -253,10 +254,12 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): for k in range(elem_length): np_out2[i, inter_idx, k] = np_data[i, j, k] np_out1[i] += 1 + np_out3[i, inter_idx] = j inter_idx += 1 if j >= np_out1[i]: for k in range(elem_length): np_out2[i, j, k] = -1.0 + np_out3[i, j] = -1 x = relay.var("x", relay.ty.TensorType(dshape, dtype)) z = relay.vision.get_valid_counts(x, score_threshold, id_index, score_index) @@ -271,6 +274,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): if target == 'cuda': return tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) + tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) verify_get_valid_counts((1, 2500, 6), 0, 0, 1) verify_get_valid_counts((1, 2500, 5), -1, -1, 0) @@ -279,15 +283,16 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): def test_non_max_suppression(): - def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, + def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", 
relay.ty.TensorType((dshape[0],), "int32")) - z = relay.vision.non_max_suppression(x0, x1, max_output_size=-1, \ + x2 = relay.var("x2", relay.ty.TensorType((dshape[0],dshape[1]), "int32")) + z = relay.vision.non_max_suppression(x0, x1, x2, max_output_size=-1, \ iou_threshold=iou_threshold, force_suppress=force_suppress, \ top_k=top_k, return_indices=False) - z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size=-1, score_threshold=0.5, \ + z_indices = relay.vision.non_max_suppression(x0, x1, x2, max_output_size=-1, \ iou_threshold=iou_threshold, force_suppress=force_suppress, \ top_k=top_k) if isinstance(z_indices, relay.expr.TupleWrapper): @@ -304,50 +309,53 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, if check_type_only: return - func = relay.Function([x0, x1], z) + func = relay.Function([x0, x1, x2], z) func = run_infer_type(func) - func_indices = relay.Function([x0, x1], z_indices) + func_indices = relay.Function([x0, x1, x2], z_indices) func_indices = run_infer_type(func_indices) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(x0_data, x1_data) + op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) if top_k == -1: - op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) - op_res2 = intrp2.evaluate(func)(x0_data, x1_data) + op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) if top_k == -1: - op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") + + np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") + np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 - dshape = (te.size_var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + dshape = (tvm.var("n"), num_anchors, 6) + verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, 1, -1, -1]]) - dshape = (te.size_var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, + dshape = (tvm.var("n"), num_anchors, 6) + verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, 
check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, + verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, top_k=3) @@ -390,7 +398,7 @@ def test_default_value(): assert ret.checked_type == ref_type - nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], mtl[0], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = run_infer_type(func) for target, ctx in ctx_list(): diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 30784f45a591..22d74438188c 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -459,7 +459,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, + out = non_max_suppression(inter_out[0], inter_out[1], inter_out[1], max_output_size=-1, iou_threshold=nms_threshold, force_suppress=force_suppress, top_k=nms_topk, return_indices=False) return out diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 48e85cdd574a..73e702447de5 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -23,54 +23,6 @@ from ..sort import argsort -@hybrid.script -def hybrid_rearrange_idx(data): - """Hybrid routine to rearrange nms output to - move all valid entries to top. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - NMS output. 2-D tensor with shape - [batch_size, num_anchors]. - - one: tvm.tir.const - Constant one with the same dtype as data. - - Returns - ------- - output : tvm.te.Tensor or numpy NDArray - Transformed NMS output. 2-D tensor with shape - [batch_size, num_anchors]. - - shape : tvm.te.Tensor or numpy NDArray - Shape of Tensor with valid indexes - [Batch_size, num_valid_indices] - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - out_tensor = output_tensor((batch_size, - num_anchors), - data.dtype) - out_shape = output_tensor((batch_size, - 1), - data.dtype) - - for i in range(batch_size): # range instead - valid_idx = 0 - for j in range(num_anchors): - if data[i, j] >= 0: - out_tensor[i, valid_idx] = data[i, j] - valid_idx += 1 - if data[i, j] > num_anchors or data[i, j] < -num_anchors: - out_tensor[i, valid_idx] = 0 - valid_idx += 1 - if j >= valid_idx: - out_tensor[i, j] = -1 - out_shape[i, 0] = valid_idx - return out_tensor, out_shape - - @hybrid.script def hybrid_rearrange_out(data, one): """Hybrid routine to rearrange nms output to @@ -80,7 +32,9 @@ def hybrid_rearrange_out(data, one): ---------- data : tvm.te.Tensor or numpy NDArray NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6]. + [batch_size, num_anchors, 6] or + [batch_size, num_anchors, 5], or 2-D + tensor with shape [batch_size, num_anchors]. one: tvm.tir.const Constant one with the same dtype as data. @@ -89,28 +43,48 @@ def hybrid_rearrange_out(data, one): ------- output : tvm.te.Tensor or numpy NDArray Transformed NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. + [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5], + or 2-D tensor with shape [batch_size, num_anchors]. 
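The hybrid script now also accepts 2-D index tensors, packing non-negative entries to the front and reporting how many were kept (see the code later in this hunk); a NumPy sketch of that 2-D behavior, with toy values assumed and the out-of-range zeroing omitted:

import numpy as np

def rearrange_2d_ref(data):
    # Move entries >= 0 to the front, pad the tail with -1,
    # and count the kept entries per batch row.
    out = np.full_like(data, -1)
    count = np.zeros((data.shape[0], 1), dtype="int32")
    for i in range(data.shape[0]):
        kept = data[i][data[i] >= 0]
        out[i, :len(kept)] = kept
        count[i, 0] = len(kept)
    return out, count

print(rearrange_2d_ref(np.array([[3, -1, 0, -1, 4]])))
# -> [[ 3  0  4 -1 -1]], [[3]]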
+ + valid_box_count : tvm.Tensor or numpy NDArray + Tensor with shape [batch_size, 1], indicates + the valid number of boxes. """ + ndim = len(data.shape) batch_size = data.shape[0] num_anchors = data.shape[1] - elem_length = data.shape[2] - output = output_tensor((batch_size, - num_anchors, - elem_length), - data.dtype) + valid_box_count = output_tensor((batch_size, 1), "int32") + output = output_tensor((batch_size, num_anchors), data.dtype) + if ndim > 2: + output = output_tensor((batch_size, + num_anchors, + data.shape[2]), + data.dtype) for i in parallel(batch_size): valid_idx = 0 for j in range(num_anchors): - if data[i, j, 0] >= 0: - for k in range(elem_length): - output[i, valid_idx, k] = data[i, j, k] - valid_idx += 1 - if j >= valid_idx: - for k in range(elem_length): - output[i, j, k] = -one - - return output + if ndim > 2: + elem_length = data.shape[2] + if data[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = data[i, j, k] + valid_idx += 1 + if j >= valid_idx: + for k in range(elem_length): + output[i, j, k] = -one + else: + if data[i, j] >= 0: + output[i, valid_idx] = data[i, j] + valid_idx += 1 + if data[i, j] > num_anchors or data[i, j] < -num_anchors: + output[i, valid_idx] = 0 + valid_idx += 1 + if j >= valid_idx: + output[i, j] = -one + valid_box_count[i, 0] = valid_idx + + return output, valid_box_count @hybrid.script @@ -139,11 +113,14 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): Returns ------- + valid_count : tvm.te.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. + out_tensor : tvm.te.Tensor or numpy NDArray Rearranged data tensor. - valid_count : tvm.te.Tensor or numpy NDArray - 1-D tensor for valid number of boxes. + out_indices: tvm.te.Tensor or numpy NDArray + Indices of the valid boxes in the input data. """ batch_size = data.shape[0] num_anchors = data.shape[1] @@ -153,6 +130,7 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): num_anchors, box_data_length), data.dtype) + out_indices = output_tensor((batch_size, num_anchors), "int32") for i in parallel(batch_size): valid_count[i] = 0 for j in range(num_anchors): @@ -161,11 +139,13 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): (id_index < 0 or data[i, j, id_index] >= 0): for k in range(box_data_length): out_tensor[i, valid_count[i], k] = data[i, j, k] + out_indices[i, valid_count[i]] = j valid_count[i] += 1 if j >= valid_count[i]: for k in range(box_data_length): out_tensor[i, j, k] = -one - return valid_count, out_tensor + out_indices[i, j] = -1 + return valid_count, out_tensor, out_indices def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): @@ -189,11 +169,14 @@ Returns ------- + valid_count : tvm.te.Tensor + 1-D tensor for valid number of boxes. + out_tensor : tvm.te.Tensor Rearranged data tensor. - valid_count : tvm.te.Tensor - 1-D tensor for valid number of boxes. + out_indices: tvm.te.Tensor or numpy NDArray + Indices of the valid boxes in the input data. 
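A NumPy reference for the hybrid get_valid_counts above (id_index < 0 case; the helper name and values are illustrative), mirroring the packed output and the new out_indices tensor:

import numpy as np

def get_valid_counts_ref(data, score_threshold, score_index=1):
    # Pack boxes whose score exceeds the threshold to the front,
    # fill the rest with -1, and record each kept box's original index.
    batch, num_anchors, _ = data.shape
    valid_count = np.zeros(batch, dtype="int32")
    out = np.full_like(data, -1.0)
    out_indices = np.full((batch, num_anchors), -1, dtype="int32")
    for i in range(batch):
        for j in range(num_anchors):
            if data[i, j, score_index] > score_threshold:
                out[i, valid_count[i]] = data[i, j]
                out_indices[i, valid_count[i]] = j
                valid_count[i] += 1
    return valid_count, out, out_indices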
""" score_threshold_const = tvm.tir.const(score_threshold, data.dtype) id_index_const = tvm.tir.const(id_index, "int32") @@ -204,8 +187,9 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): @hybrid.script -def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, - force_suppress, top_k, coord_start, id_index, score_index, zero, one): +def hybrid_nms(data, sorted_index, valid_count, indices, max_output_size, + iou_threshold, force_suppress, top_k, coord_start, score_index, + id_index, return_indices, zero, one): """Hybrid routing for non-maximum suppression. Parameters @@ -221,6 +205,9 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, valid_count : tvm.te.Tensor or numpy NDArray 1-D tensor for valid number of boxes. + indices : tvm.Tensor or numpy.NDArray + indices in original tensor, with shape [batch_size, num_anchors] + max_output_size : tvm.tir.const Max number of output valid boxes for each instance. By default all valid boxes are returned. @@ -237,11 +224,14 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, coord_start : tvm.tir.const Start index of the consecutive 4 coordinates. + score_index: tvm.tir.const + Index of the scores/confidence of boxes. + id_index : tvm.tir.const index of the class categories, -1 to disable. - score_index: tvm.tir.const - Index of the scores/confidence of boxes. + return_indices : tvm.tir.const + Whether to return box indices in input data. zero: tvm.tir.const Constant zero with the same dtype as data. @@ -258,9 +248,12 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, box_indices: tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors]. """ + batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] + + # box_indices is the expected value, similar to TF & ONNX box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype) output = output_tensor((batch_size, num_anchors, @@ -282,9 +275,11 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, for k in range(box_data_length): output[i, j + nkeep, k] = -one box_indices[i, j + nkeep] = -1 + # Apply nms box_start_idx = coord_start batch_idx = i + for j in range(valid_count[i]): if output[i, j, score_index] > 0 and (id_index < 0 or output[i, j, id_index] >= 0): box_a_idx = j @@ -296,36 +291,62 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, check_iou = 1 elif id_index < 0 or output[i, j, id_index] == output[i, k, id_index]: check_iou = 1 + if check_iou > 0: - a_l = output[batch_idx, box_a_idx, box_start_idx] - a_t = output[batch_idx, box_a_idx, box_start_idx + 1] - a_r = output[batch_idx, box_a_idx, box_start_idx + 2] - a_b = output[batch_idx, box_a_idx, box_start_idx + 3] + # a_l: left, a_t: top, a_r: right, a_b: bottom + a_l = min(output[batch_idx, box_a_idx, box_start_idx], + output[batch_idx, box_a_idx, box_start_idx + 2]) + a_t = min(output[batch_idx, box_a_idx, box_start_idx + 1], + output[batch_idx, box_a_idx, box_start_idx + 3]) + a_r = max(output[batch_idx, box_a_idx, box_start_idx], + output[batch_idx, box_a_idx, box_start_idx + 2]) + a_b = max(output[batch_idx, box_a_idx, box_start_idx + 1], + output[batch_idx, box_a_idx, box_start_idx + 3]) + box_b_idx = k - b_t = output[batch_idx, box_b_idx, box_start_idx + 1] - b_b = output[batch_idx, box_b_idx, box_start_idx + 3] - b_l = output[batch_idx, box_b_idx, box_start_idx] - b_r = output[batch_idx, 
box_b_idx, box_start_idx + 2] + + # b_l: left, b_t: top, b_r: right, b_b: bottom + b_l = min(output[batch_idx, box_b_idx, box_start_idx], + output[batch_idx, box_b_idx, box_start_idx + 2]) + b_t = min(output[batch_idx, box_b_idx, box_start_idx + 1], + output[batch_idx, box_b_idx, box_start_idx + 3]) + b_r = max(output[batch_idx, box_b_idx, box_start_idx], + output[batch_idx, box_b_idx, box_start_idx + 2]) + b_b = max(output[batch_idx, box_b_idx, box_start_idx + 1], + output[batch_idx, box_b_idx, box_start_idx + 3]) + + # Overlapping width and height w = max(zero, min(a_r, b_r) - max(a_l, b_l)) h = max(zero, min(a_b, b_b) - max(a_t, b_t)) + + # Overlapping area area = h * w + + # total area of the figure formed by box a and box b + # except for overlapping area u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + + # get the iou iou = zero if u <= zero else area / u + if iou >= iou_threshold: output[i, k, score_index] = -one if id_index >= 0: output[i, k, id_index] = -one box_indices[i, k] = -1 + else: for j in parallel(valid_count[i]): for k in range(box_data_length): output[i, j, k] = data[i, j, k] box_indices[i, j] = j + # Set invalid entry to be -1 for j in parallel(num_anchors - valid_count[i]): for k in range(box_data_length): output[i, j + valid_count[i], k] = -one box_indices[i, j + valid_count[i]] = -1 + # Only return max_output_size valid boxes num_valid_boxes = 0 if max_output_size > 0: @@ -337,138 +358,17 @@ def hybrid_nms(data, sorted_index, valid_count, max_output_size, iou_threshold, box_indices[i, j] = -1 else: num_valid_boxes += 1 - return output, box_indices - -@hybrid.script -def hybrid_dynamic_nms(data, sorted_index, max_output_size, score_threshold, - iou_threshold, score_index, zero, one): - """Hybrid routing for non-maximum suppression. - - Parameters - ---------- - data: tvm.te.Tensor or numpy NDArray - Bounding boxes with class and score. 3-D tensor with shape - [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. - - sorted_index : tvm.te.Tensor or numpy NDArray - Bounding box indexes sorted by score, with shape - [batch_size, num_anchors]. - - max_output_size : tvm.tir.const - Max number of output valid boxes for each instance. - By default all valid boxes are returned. - - score_threshold : tvm.tir.const - Lower limit of score for valid bounding boxes. - - iou_threshold : tvm.tir.const - Overlapping(IoU) threshold to suppress object with smaller score. - - score_index: tvm.tir.const - Index of the scores/confidence of boxes. - - zero: tvm.tir.const - Constant zero with the same dtype as data. - - one: tvm.tir.const - Constant one with the same dtype as data. - - Returns - ------- - box_indices: tvm.te.Tensor - 2-D tensor with shape [batch_size, num_anchors]. 
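The min/max guards introduced above make the overlap computation tolerant of boxes whose corners are given in flipped order; the same IoU arithmetic as a standalone sketch:

def iou_ref(box_a, box_b):
    # Boxes are (l, t, r, b) tuples; min/max normalizes flipped corners.
    a_l, a_r = min(box_a[0], box_a[2]), max(box_a[0], box_a[2])
    a_t, a_b = min(box_a[1], box_a[3]), max(box_a[1], box_a[3])
    b_l, b_r = min(box_b[0], box_b[2]), max(box_b[0], box_b[2])
    b_t, b_b = min(box_b[1], box_b[3]), max(box_b[1], box_b[3])
    w = max(0.0, min(a_r, b_r) - max(a_l, b_l))   # overlap width
    h = max(0.0, min(a_b, b_b) - max(a_t, b_t))   # overlap height
    area = w * h
    union = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
    return 0.0 if union <= 0 else area / union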
- """ - - batch_size = data.shape[0] - num_anchors = data.shape[1] - box_data_length = data.shape[2] - - # box_indices is the expected value, similar to TF & ONNX - box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype) - output = output_tensor((batch_size, - num_anchors, - box_data_length,), data.dtype) - - for i in range(batch_size): - if iou_threshold > 0: - # Reorder output - for j in parallel(num_anchors): - for k in range(box_data_length): - output[i, j, k] = data[i, sorted_index[i, j], k] - if output[i, j, score_index] > score_threshold: - box_indices[i, j] = sorted_index[i, j] - else: - box_indices[i, j] = -1 - - # Apply nms - box_start_idx = 1 - batch_idx = i - - for j in range(num_anchors): - # index sorted - j_sorted = sorted_index[i, j] - - box_a_idx = j - # l: left, t: top, r: right, b: bottom - a_l = min(output[batch_idx, box_a_idx, box_start_idx], - output[batch_idx, box_a_idx, box_start_idx + 2]) - a_t = min(output[batch_idx, box_a_idx, box_start_idx + 1], - output[batch_idx, box_a_idx, box_start_idx + 3]) - a_r = max(output[batch_idx, box_a_idx, box_start_idx], - output[batch_idx, box_a_idx, box_start_idx + 2]) - a_b = max(output[batch_idx, box_a_idx, box_start_idx + 1], - output[batch_idx, box_a_idx, box_start_idx + 3]) - - for k in parallel(j + 1, num_anchors): - k_sorted = sorted_index[i, k] - box_b_idx = k - # l: left, t: top, r: right, b: bottom - b_l = min(output[batch_idx, box_b_idx, box_start_idx], - output[batch_idx, box_b_idx, box_start_idx + 2]) - b_t = min(output[batch_idx, box_b_idx, box_start_idx + 1], - output[batch_idx, box_b_idx, box_start_idx + 3]) - b_r = max(output[batch_idx, box_b_idx, box_start_idx], - output[batch_idx, box_b_idx, box_start_idx + 2]) - b_b = max(output[batch_idx, box_b_idx, box_start_idx + 1], - output[batch_idx, box_b_idx, box_start_idx + 3]) - - # Overlapping width and height - w = max(zero, min(a_r, b_r) - max(a_l, b_l)) - h = max(zero, min(a_b, b_b) - max(a_t, b_t)) - - # Overlapping area - area = h * w - - # total area of the figure formed by box a and box b except for overlapping area - u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area - - # get the iou - iou = area / u - - # output[i, k, sorted_index] = iou - - if iou >= score_threshold: - box_indices[i, k] = -1 - - else: - for j in parallel(num_anchors): - box_indices[i, j] = sorted_index[i, j] - - # Only return max_output_size valid boxes - num_valid_boxes = 0 - if max_output_size > 0: - for j in parallel(num_anchors): - if num_valid_boxes == max_output_size: - box_indices[i, j] = -1 - else: - num_valid_boxes += 1 + if return_indices: + for j in range(valid_count[i]): + idx = box_indices[i, j] + if box_indices[i, j] >= 0: + box_indices[i, j] = indices[i, idx] return output, box_indices - @tvm.target.generic_func -def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0.0, +def non_max_suppression(data, valid_count, indices, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): @@ -482,13 +382,13 @@ def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0 valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. + indices : tvm.Tensor + 2-D tensor with shape [batch_size, num_anchors]. + max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. 
- score_threshold : optional, float - Lower limit of score for valid bounding boxes. - iou_threshold : optional, float Non-maximum suppression threshold. @@ -517,7 +417,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0 ------- out : tvm.te.Tensor or tuple of tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6] - or [batch_size, num_anchors, 6]. Out is a tuple of tvm.te.Tensor + or [batch_size, num_anchors, 5]. Out is a tuple of tvm.te.Tensor if return_indices is True, the Tensor in the tuple is 2-D tensor with shape [batch_size, num_anchors] and shape [batch_size, num_valid_anchors] respectively. @@ -533,7 +433,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0 iou_threshold = 0.7 force_suppress = True top_k = -1 - out = non_max_suppression(data, valid_count, iou_threshold=iou_threshold, + out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) np_data = np.random.uniform(dshape) np_valid_count = np.array([4]) @@ -551,33 +451,24 @@ def non_max_suppression(data, valid_count, max_output_size=-1, score_threshold=0 score_shape = (batch_size, num_anchors) score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis]) sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) - + out, box_indices = hybrid_nms(data, + sort_tensor, + valid_count, + indices, + tvm.const(max_output_size, dtype="int32"), + tvm.const(iou_threshold, dtype=data.dtype), + tvm.const(force_suppress, dtype="bool"), + tvm.const(top_k, dtype="int32"), + tvm.const(coord_start, dtype="int32"), + tvm.const(score_index, dtype="int32"), + tvm.const(id_index, dtype="int32"), + tvm.const(return_indices, dtype="bool"), + zero=tvm.const(0, dtype=data.dtype), + one=tvm.const(1, dtype=data.dtype)) if return_indices: - # return a tuple with two tensor, one is the computed valid indices of boxes, appending -1 as invalid boxes - # the other one is the number of valid boxes - out, box_indices = hybrid_dynamic_nms(data, - sort_tensor, - tvm.tir.const(max_output_size, dtype="int32"), - tvm.tir.const(score_threshold, dtype=data.dtype), - tvm.tir.const(iou_threshold, dtype=data.dtype), - tvm.tir.const(score_index, dtype="int32"), - zero=tvm.tir.const(0, dtype=data.dtype), - one=tvm.tir.const(1, dtype=data.dtype)) - box_indices, out_shape = hybrid_rearrange_idx(box_indices) - return [box_indices, out_shape] - else: - out, box_indices = hybrid_nms(data, - sort_tensor, - valid_count, - tvm.tir.const(max_output_size, dtype="int32"), - tvm.tir.const(iou_threshold, dtype=data.dtype), - tvm.tir.const(force_suppress, dtype="bool"), - tvm.tir.const(top_k, dtype="int32"), - tvm.tir.const(coord_start, dtype="int32"), - tvm.tir.const(id_index, dtype="int32"), - tvm.tir.const(score_index, dtype="int32"), - zero=tvm.tir.const(0, dtype=data.dtype), - one=tvm.tir.const(1, dtype=data.dtype)) - if invalid_to_bottom: - out = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) - return out + box_indices, out_shape = hybrid_rearrange_out(box_indices, one=tvm.const(1, dtype="int32")) + return tuple([box_indices, out_shape]) + + if invalid_to_bottom: + out, out_shape = hybrid_rearrange_out(out, one=tvm.const(1, dtype=data.dtype)) + return out diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index e1ddc7bab9b0..e5b92156bdc3 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ 
b/topi/python/topi/vision/ssd/multibox.py @@ -304,7 +304,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression(inter_out[0], inter_out[1], max_output_size=-1, score_threshold=0, + out = non_max_suppression(inter_out[0], inter_out[1], inter_out[1], max_output_size=-1, iou_threshold=nms_threshold, force_suppress=force_suppress, top_k=nms_topk, return_indices=False) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 77cd2f1d7e65..bd71742c83ab 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -69,6 +69,7 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): np_data = np.random.uniform(low=-2, high=2, size=dshape).astype(dtype) np_out1 = np.zeros(shape=(batch_size,)) np_out2 = np.zeros(shape=dshape).astype(dtype) + np_out3 = np.zeros(shape=(batch_size, num_anchor)) for i in range(batch_size): np_out1[i] = 0 inter_idx = 0 @@ -78,10 +79,12 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): for k in range(elem_length): np_out2[i, inter_idx, k] = np_data[i, j, k] np_out1[i] += 1 + np_out3[i, inter_idx] = j inter_idx += 1 if j >= np_out1[i]: for k in range(elem_length): np_out2[i, j, k] = -1.0 + np_out3[i, j] = -1 def check_device(device): ctx = tvm.context(device, 0) @@ -98,10 +101,12 @@ def check_device(device): tvm_input_data = tvm.nd.array(np_data, ctx) tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) - f = tvm.build(s, [data, outs[0], outs[1]], device) - f(tvm_input_data, tvm_out1, tvm_out2) + tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx) + f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) + f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) """ Skip this test as it is intermittent see https://github.com/apache/incubator-tvm/pull/4901#issuecomment-595040094 @@ -114,6 +119,7 @@ def check_device(device): def test_get_valid_counts(): + verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0) verify_get_valid_counts((1, 2500, 6), 0, 0, 1) verify_get_valid_counts((1, 2500, 5), -1, -1, 0) verify_get_valid_counts((3, 1000, 6), 0.55, 1, 0) @@ -147,16 +153,18 @@ def check_device(device): tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_indices = tvm.nd.array(np_indices, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) - f = tvm.build(s, [data, valid_count, out], device) - f(tvm_data, tvm_valid_count, tvm_out) + f = tvm.build(s, [data, valid_count, indices, out], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - f = tvm.build(indices_s, [data, valid_count, indices_out[0]], device) - f(tvm_data, tvm_valid_count, tvm_indices_out) - tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + # TODO 
(yongwww): add dynamic nms for gpu + # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) for device in ['llvm', 'cuda', 'opencl']: check_device(device) @@ -167,23 +175,24 @@ def test_non_max_suppression(): [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") + np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[3, 0, -1, -1, -1]]) + np_indices_result = np.array([[4, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.6, 0.7, True, 2, 2, 1, 0) + verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.6, True, 2, 2, 1, 0) np_data = np.array([[[0.8, 1, 20, 25, 45], [0.7, 30, 60, 50, 80], [0.4, 4, 21, 19, 40], [0.9, 35, 61, 52, 79], [0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") + np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") np_result = np.array([[[0.9, 35, 61, 52, 79], [0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[3, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_result, 0.6, 0.7, False, 2, 1, 0, -1) - + np_indices_result = np.array([[4, 0, -1, -1, -1]]) + verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.6, False, 2, 1, 0, -1) def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): From c3be4817d7dc8c5f020026081799ebaad64bda55 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 17 Nov 2019 18:39:48 +0000 Subject: [PATCH 03/22] fix nnvm compatibility issues --- topi/python/topi/cuda/nms.py | 7 +++-- topi/python/topi/image/dilation2d.py | 12 ++++---- topi/python/topi/math.py | 12 ++++---- topi/python/topi/vision/nms.py | 6 ++-- topi/python/topi/x86/conv2d_alter_op.py | 4 ++- topi/python/topi/x86/conv3d.py | 6 ++-- topi/tests/python/test_topi_vision.py | 39 +++++++++++++++---------- 7 files changed, 50 insertions(+), 36 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index d8be3bd1b886..255cf6fc30f7 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -335,7 +335,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): return ib.get() -def non_max_suppression(data, valid_count, max_output_size=-1, +def non_max_suppression(data, valid_count, indices, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, return_indices=True, invalid_to_bottom=False): @@ -351,6 +351,9 @@ def non_max_suppression(data, valid_count, max_output_size=-1, valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. + indices : tvm.te.Tensor + 2-D tensor with shape [batch_size, num_anchors]. + max_output_size : optional, int Max number of output valid boxes for each instance. By default all valid boxes are returned. 
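As a usage sketch of the updated three-input signature (generic CPU path; placeholder shapes are assumptions, mirroring the tests in this series):

import tvm
from tvm import te
from topi.vision.nms import non_max_suppression

data = te.placeholder((1, 5, 6), name="data")
valid_count = te.placeholder((1,), dtype="int32", name="valid_count")
indices = te.placeholder((1, 5), dtype="int32", name="indices")
# Builds the compute only; scheduling is dispatched per target elsewhere.
out = non_max_suppression(data, valid_count, indices, max_output_size=-1,
                          iou_threshold=0.5, force_suppress=True, top_k=2,
                          return_indices=False)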
@@ -445,4 +448,4 @@ def non_max_suppression(data, valid_count, max_output_size=-1, if return_indices: return box_indices - return out + return out \ No newline at end of file diff --git a/topi/python/topi/image/dilation2d.py b/topi/python/topi/image/dilation2d.py index a71866e60a98..074ca6c02d08 100644 --- a/topi/python/topi/image/dilation2d.py +++ b/topi/python/topi/image/dilation2d.py @@ -29,10 +29,10 @@ def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None): Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - filter : tvm.Tensor + filter : tvm.te.Tensor 3-D with shape [ in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints @@ -49,7 +49,7 @@ def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, in_channel, out_height, out_width] """ if out_dtype is None: @@ -100,10 +100,10 @@ def dilation2d_nhwc(input, filter, stride, padding, dilations, out_dtype=None): Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor + filter : tvm.te.Tensor 3-D with shape [filter_height, filter_width, in_channel] stride : int or a list/tuple of two ints @@ -120,7 +120,7 @@ def dilation2d_nhwc(input, filter, stride, padding, dilations, out_dtype=None): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, in_channel] """ if out_dtype is None: diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py index d715308573a4..b4228a4a9178 100644 --- a/topi/python/topi/math.py +++ b/topi/python/topi/math.py @@ -401,12 +401,12 @@ def isfinite(x): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return te.compute(x.shape, lambda *i: te.isfinite(x(*i))) @@ -418,12 +418,12 @@ def isinf(x): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return te.compute(x.shape, lambda *i: te.isinf(x(*i))) @@ -677,12 +677,12 @@ def fast_tanh(x): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return cpp.fast_tanh(x, x.dtype, tag.ELEMWISE) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 73e702447de5..fdf2430b3a02 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -46,7 +46,7 @@ def hybrid_rearrange_out(data, one): [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5], or 2-D tensor with shape [batch_size, num_anchors]. - valid_box_count : tvm.Tensor or numpy NDArray + valid_box_count : tvm.te.Tensor or numpy NDArray Tensor with shape [batch_size, 1], indicates the valid number of boxes. """ @@ -205,7 +205,7 @@ def hybrid_nms(data, sorted_index, valid_count, indices, max_output_size, valid_count : tvm.te.Tensor or numpy NDArray 1-D tensor for valid number of boxes. - indices : tvm.Tensor or numpy.NDArray + indices : tvm.te.Tensor or numpy.NDArray indices in original tensor, with shape [batch_size, num_anchors] max_output_size : tvm.tir.const @@ -382,7 +382,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. 
- indices : tvm.Tensor + indices : tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors]. max_output_size : optional, int diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index d1c607f6a3e5..e9fc4223a9ea 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -312,7 +312,9 @@ def _conv2d_legalize(attrs, inputs, arg_types): new_attrs['channels'] = new_out_channel out = tvm.relay.nn.conv2d(data, kernel, **new_attrs) original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out, begin=(0, 0, 0, 0), end=original_out_shape) + out = relay.strided_slice(out, + begin=relay.const([0, 0, 0, 0], "int32"), + end=relay.const(original_out_shape, "int32")) else: out = relay.nn.conv2d(data, kernel, **new_attrs) diff --git a/topi/python/topi/x86/conv3d.py b/topi/python/topi/x86/conv3d.py index 27f48f8dc69a..f0dee31a9992 100644 --- a/topi/python/topi/x86/conv3d.py +++ b/topi/python/topi/x86/conv3d.py @@ -78,11 +78,11 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype): Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 5-D input data with shapes: [batch, in_channel, in_depth, in_height, in_width] for NCDHW layout - filter : tvm.Tensor + filter : tvm.te.Tensor 5-D filter with shape [out_channels, in_channels, kernel_depth, kernel_height, kernel_width] strides : int or a list/tuple of three ints @@ -96,7 +96,7 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] for NCDHW layout """ layout = "NCDHW" diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index bd71742c83ab..072fa8eb7e32 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -102,11 +102,17 @@ def check_device(device): tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx) - f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) - f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) - tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) + if device == "llvm": + f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) + f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) + else: + f = tvm.build(s, [data, outs[0], outs[1]], device) + f(tvm_input_data, tvm_out1, tvm_out2) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) """ Skip this test as it is intermittent see https://github.com/apache/incubator-tvm/pull/4901#issuecomment-595040094 @@ -161,10 +167,13 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) - f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) - f(tvm_data, tvm_valid_count, tvm_indices, 
tvm_indices_out) - # TODO (yongwww): add dynamic nms for gpu - # tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + if device == 'llvm': + f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + else: + f = tvm.build(indices_s, [data, valid_count, indices, indices_out], device) + f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out) + tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) for device in ['llvm', 'cuda', 'opencl']: check_device(device) @@ -175,24 +184,24 @@ def test_non_max_suppression(): [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") - np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") + np_indices = np.array([[0, 1, 2, 3, 4]]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[4, 0, -1, -1, -1]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.6, True, 2, 2, 1, 0) + verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.7, True, 2, 2, 1, 0) np_data = np.array([[[0.8, 1, 20, 25, 45], [0.7, 30, 60, 50, 80], [0.4, 4, 21, 19, 40], [0.9, 35, 61, 52, 79], [0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") - np_indices = np.array([[0, 1, 3, 4, -1]]).astype("int32") + np_indices = np.array([[0, 1, 2, 3, 4]]).astype("int32") np_result = np.array([[[0.9, 35, 61, 52, 79], [0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[4, 0, -1, -1, -1]]) - verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.6, False, 2, 1, 0, -1) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) + verify_non_max_suppression(np_data, np_valid_count, np_indices, np_result, np_indices_result, 0.7, False, 2, 1, 0, -1) def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): From 47faed9773d6757e64e356b018de4d78314c722c Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 18 Nov 2019 14:37:09 -0800 Subject: [PATCH 04/22] fix InferCorrectLayout --- python/tvm/relay/frontend/tensorflow.py | 6 +- python/tvm/relay/op/strategy/generic.py | 6 +- python/tvm/relay/op/vision/nms.py | 2 +- src/relay/op/tensor/transform.cc | 55 +++++++++++++------ .../transforms/combine_parallel_conv2d.cc | 15 +++-- src/relay/transforms/fuse_ops.cc | 17 ++++++ .../frontend/tensorflow/test_forward.py | 2 - tests/python/relay/test_any.py | 2 +- tests/python/relay/test_op_level5.py | 8 +-- .../python/relay/test_pass_alter_op_layout.py | 4 +- topi/python/topi/vision/nms.py | 24 ++++---- 11 files changed, 90 insertions(+), 51 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 59740d204744..d0dbdd7121b8 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -617,7 +617,7 @@ def _impl(inputs, attr, params, mod): return out def _nms(): - def _impl(inputs, attr, params): + def _impl(inputs, attr, params, mod): # Get parameter values max_output_size = 
int(np.atleast_1d(inputs[2].data.asnumpy().astype("int64"))[0]) iou_threshold = np.atleast_1d(inputs[3].data.asnumpy())[0] @@ -626,6 +626,7 @@ def _impl(inputs, attr, params): # Generate data with shape (1, num_anchors, 5) scores = AttrCvt(op_name="expand_dims", + ignores=['T_threshold'], extras={'axis': -1, 'num_newaxis': 1})([inputs[1]], attr) data = get_relay_op('concatenate')([scores, inputs[0]], -1) data = get_relay_op('expand_dims')(data, 0, 1) @@ -651,6 +652,7 @@ def _impl(inputs, attr, params): id_index=-1, return_indices=True, invalid_to_bottom=False) + # squeeze it, TF NMS is not batched end = get_relay_op("squeeze")(nms_ret[1], axis=[1]) data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0]) @@ -2531,7 +2533,7 @@ class LoopBound(ExprVisitor): .. code-block:: python i = tf.constant(0) - data = tf.compat.v1.placeholder(tf.float32, shape=(1024, 1024)) + data = tf.placeholder(tf.float32, shape=(1024, 1024)) slice = tf.strided_slice(data, 0, 512) def c(i): return tf.less(i, 10) def b(i): return [tf.add(i, 1), tf.add(i, 1) + slice] diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 99439affee1a..0cedaa1c07f0 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -695,9 +695,9 @@ def _compute_nms(attrs, inputs, out_type): score_index = get_const_int(attrs.score_index) id_index = get_const_int(attrs.id_index) invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) - return [topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold, - force_suppress, top_k, coord_start, score_index, - id_index, return_indices, invalid_to_bottom)] + return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, id_index, + return_indices, invalid_to_bottom)] return _compute_nms @override_native_generic_func("non_max_suppression_strategy") diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index f2657f47c6b2..d599ea55d0ba 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -117,7 +117,7 @@ def non_max_suppression(data, ------- out : relay.Expr or relay.Tuple return relay.Expr if return_indices is disabled, a 3-D tensor - with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 6]. + with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. if return_indices is True, return relay.Tuple of two 2-D tensors, with shape [batch_size, num_anchors] and [batch_size, num_valid_anchors] respectively. 
""" diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 4e1b05e04620..b464330c0bb9 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1732,7 +1732,7 @@ int64_t* ToVector(const runtime::NDArray& array) { return rel_vec; } } - LOG(FATAL) << "Unknown data type: " << tvm::runtime::TVMType2String(array->dtype); + LOG(FATAL) << "Unknown data type: " << tvm::runtime::DLDataType2String(array->dtype); return rel_vec; } @@ -1825,7 +1825,7 @@ bool StridedSliceRel(const Array& types, oshape[i] = tir::make_const(dshape[i].dtype(), (slice_range + step - 1) / step); } } else { - for (size_t i = 0; i < num_axis; ++i) { + for (int64_t i = 0; i < num_axis; ++i) { oshape[i] = Any::make(); } } @@ -1850,7 +1850,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, auto layout = old_in_layouts[0]; if (layout.defined() && new_in_layouts.defined()) { - CHECK_EQ(new_in_layouts.size(), 1); + CHECK_GE(new_in_layouts.size(), 1); auto new_layout = new_in_layouts[0]; auto shape = old_in_shapes[0]; @@ -1907,25 +1907,44 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, new_end.push_back(tvm::Integer(ed / factor)); } } + layout = new_layout; + + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + auto begin_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, + DataType::Int(64), ctx); + auto end_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, + DataType::Int(64), ctx); + auto strides_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, + DataType::Int(64), ctx); + int64_t* begin_data = static_cast(begin_ndarray->data); + int64_t* end_data = static_cast(end_ndarray->data); + for (size_t i = 0; i < new_begin.size(); ++i) { + begin_data[i] = new_begin[i]; + end_data[i] = new_end[i]; + } + params->begin = Constant(begin_ndarray); + params->end = Constant(end_ndarray); } - return {{layout}, {layout}}; + return {{layout, Layout("C"), Layout("C"), Layout("C")}, {layout}}; } -inline Tensor DynamicStridedSlice(const tvm::Tensor& input, - const tvm::Tensor& begin, - const tvm::Tensor& end, - const tvm::Tensor& strides, +inline te::Tensor DynamicStridedSlice(const te::Tensor& input, + const te::Tensor& begin, + const te::Tensor& end, + const te::Tensor& strides, std::string name = "T_strided_slice_dynamic", std::string tag = topi::kInjective) { int64_t src_tensor_dim = input->shape.size(); - Array out_shape; + Array out_shape; for (int64_t i = 0; i < src_tensor_dim; ++i) { - out_shape.push_back(tvm::Var("dim")); + out_shape.push_back(tvm::tir::Var("dim")); } - // TODO(yongwww): move the compute into topi after nnvm is removed - return tvm::compute(out_shape, [&](const Array& indices) { - Array real_indices; + // TODO(yongwww): move the compute into topi + return te::compute(out_shape, [&](const Array& indices) { + Array real_indices; for (int32_t i = 0; i < src_tensor_dim; ++i) { real_indices.push_back(indices[i] * strides(i) + begin(i)); } @@ -1958,12 +1977,12 @@ Array StridedSliceCompute(const Attrs& attrs, const Array{ + return Array{ DynamicStridedSlice(data, begin, end, strides) }; } diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 58ac315d3234..6c53b6706fc9 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -184,9 +184,12 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; - auto 
begin_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); - auto end_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); - auto strides_ndarray = runtime::NDArray::Empty({1}, DataType::Int(64), ctx); + auto begin_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, + DataType::Int(64), ctx); + auto end_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, + DataType::Int(64), ctx); + auto strides_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, + DataType::Int(64), ctx); int64_t* begin_data = static_cast(begin_ndarray->data); int64_t* end_data = static_cast(end_ndarray->data); @@ -196,9 +199,9 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { } auto slice = MakeStridedSlice(data, - ConstantNode::make(begin_ndarray), - ConstantNode::make(end_ndarray), - ConstantNode::make(strides_ndarray)); + Constant(begin_ndarray), + Constant(end_ndarray), + Constant(strides_ndarray)); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 01f1eeea30b3..13bfa7bb28de 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -249,6 +249,23 @@ class IndexedForwardGraph::Creator : private ExprVisitor { this->Update(call->op, node, kOpaque); } + if (call->attrs.as()) { + bool is_dyn{false}; + for (auto arg : call->args) { + auto arg_tt = arg->checked_type().as(); + if (arg_tt) { + for (auto dim : arg_tt->shape) { + if (dim.as()) { + is_dyn = true; + } + } + } + if (is_dyn) break; + } + if (!is_dyn) { + op_pattern = kInjective; + } + } node->pattern = op_pattern; this->Update(call->op, nullptr, kOpaque); const auto* rtype = call->checked_type().as(); diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 82c00e2db1d9..76d2fe13aa49 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -190,7 +190,6 @@ def name_without_num(name): target=device, out_names=out_name, num_output=len(out_name), opt_level=opt_level, mode=mode, cuda_layout=cuda_layout) - # since the names from tensorflow and relay runs are not exactly same, # first len(tf_output) will be compared for i in range(len(tf_output)): @@ -3321,7 +3320,6 @@ def test_forward_isfinite(): test_forward_space_to_batch_nd() test_forward_batch_to_space_nd() test_forward_dilation() - test_forward_nms_v3() # End to End test_forward_inception_v3() diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index e6a462bd737e..9464e865b4e1 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -645,7 +645,7 @@ def test_arange_with_dynamic_shape(): def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, data_np_shape): - mod = relay.Module() + mod = tvm.IRModule() data = relay.var('data', shape=data_shape, dtype='float32') begin = relay.var('begin', shape=begin_shape, dtype="int32") end = relay.var('end', shape=end_shape, dtype="int32") diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index df3091f3021d..0cee8a0f391f 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -288,13 +288,13 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int32")) - x2 = 
relay.var("x2", relay.ty.TensorType((dshape[0],dshape[1]), "int32")) + x2 = relay.var("x2", relay.ty.TensorType((dshape[0], dshape[1]), "int32")) z = relay.vision.non_max_suppression(x0, x1, x2, max_output_size=-1, \ iou_threshold=iou_threshold, force_suppress=force_suppress, \ top_k=top_k, return_indices=False) z_indices = relay.vision.non_max_suppression(x0, x1, x2, max_output_size=-1, \ iou_threshold=iou_threshold, force_suppress=force_suppress, \ - top_k=top_k) + top_k=top_k, return_indices=True) if isinstance(z_indices, relay.expr.TupleWrapper): z_indices = z_indices.astuple() assert "iou_threshold" in z.astext() @@ -340,7 +340,7 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 - dshape = (tvm.var("n"), num_anchors, 6) + dshape = (te.size_var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) @@ -351,7 +351,7 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, 1, -1, -1]]) - dshape = (tvm.var("n"), num_anchors, 6) + dshape = (te.size_var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index bc0420f26d9b..c45b82f36602 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -620,7 +620,7 @@ def before(): x = relay.var("x", shape=(1, 32, 28, 28)) weight = relay.var('weight', shape=(32, 32, 3, 3)) y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1)) - y = relay.strided_slice(y, begin=[0, 16], end=[None, None]) + y = relay.strided_slice(y, begin=relay.const([0, 16], "int32"), end=relay.const([1, 32], "int32")) y = relay.Function(analysis.free_vars(y), y) return y @@ -636,7 +636,7 @@ def expected(): x = relay.layout_transform(x, "NCHW", "NCHW4c") y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW4c") - y = relay.strided_slice(y, begin=[0, 4], end=[None, 8]) + y = relay.strided_slice(y, begin=relay.const([0, 4], "int32"), end=relay.const([1, 8], "int32")) y = relay.layout_transform(y, "NCHW4c", "NCHW") y = relay.Function(analysis.free_vars(y), y) return y diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index fdf2430b3a02..247e6b58b709 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -455,20 +455,20 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, sort_tensor, valid_count, indices, - tvm.const(max_output_size, dtype="int32"), - tvm.const(iou_threshold, dtype=data.dtype), - tvm.const(force_suppress, dtype="bool"), - tvm.const(top_k, dtype="int32"), - tvm.const(coord_start, dtype="int32"), - tvm.const(score_index, dtype="int32"), - tvm.const(id_index, dtype="int32"), - tvm.const(return_indices, dtype="bool"), - zero=tvm.const(0, dtype=data.dtype), - one=tvm.const(1, dtype=data.dtype)) + tvm.tir.const(max_output_size, dtype="int32"), + tvm.tir.const(iou_threshold, dtype=data.dtype), + tvm.tir.const(force_suppress, dtype="bool"), + tvm.tir.const(top_k, dtype="int32"), + 
tvm.tir.const(coord_start, dtype="int32"), + tvm.tir.const(score_index, dtype="int32"), + tvm.tir.const(id_index, dtype="int32"), + tvm.tir.const(return_indices, dtype="bool"), + zero=tvm.tir.const(0, dtype=data.dtype), + one=tvm.tir.const(1, dtype=data.dtype)) if return_indices: - box_indices, out_shape = hybrid_rearrange_out(box_indices, one=tvm.const(1, dtype="int32")) + box_indices, out_shape = hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32")) return tuple([box_indices, out_shape]) if invalid_to_bottom: - out, out_shape = hybrid_rearrange_out(out, one=tvm.const(1, dtype=data.dtype)) + out, out_shape = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) return out From a4955cafa23ac5a782f004f2927cd343192474cc Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Wed, 15 Apr 2020 21:28:52 +0000 Subject: [PATCH 05/22] Minor fix --- python/tvm/relay/op/strategy/generic.py | 11 +++++-- python/tvm/relay/op/vision/_vision.py | 40 +++++++++++++++++++++++++ topi/python/topi/vision/nms.py | 3 +- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 0cedaa1c07f0..a843c33e3cf7 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -695,9 +695,14 @@ def _compute_nms(attrs, inputs, out_type): score_index = get_const_int(attrs.score_index) id_index = get_const_int(attrs.id_index) invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) - return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, - force_suppress, top_k, coord_start, score_index, id_index, - return_indices, invalid_to_bottom)] + if return_indices: + return topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, id_index, + return_indices, invalid_to_bottom) + else: + return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, id_index, + return_indices, invalid_to_bottom)] return _compute_nms @override_native_generic_func("non_max_suppression_strategy") diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 6e2008ad74c0..00d369c89c94 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -18,6 +18,10 @@ """Definition of vision ops""" from __future__ import absolute_import +import topi + +from tvm.te.hybrid import script + from .. import op as reg from .. 
import strategy from ..op import OpPattern @@ -40,3 +44,39 @@ # non-maximum suppression reg.register_strategy("vision.non_max_suppression", strategy.nms_strategy) reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) + +@script +def _get_valid_counts_shape_func(data_shape): + valid_counts_shape = output_tensor((1,), "int64") + out_tensor_shape = output_tensor((data_shape.shape[0],), "int64") + out_indices_shape = output_tensor((2,), "int64") + + valid_counts_shape[0] = data_shape[0] + for i in const_range(data_shape.shape[0]): + out_tensor_shape[i] = data_shape[i] + out_indices_shape[0] = data_shape[0] + out_indices_shape[1] = data_shape[1] + + return valid_counts_shape, out_tensor_shape, out_indices_shape + +@reg.register_shape_func("vision.get_valid_counts", False) +def get_valid_counts_shape_func(attrs, inputs, _): + return _get_valid_counts_shape_func(inputs[0]) + +@script +def _nms_shape_func(data_shape): + out_shape = output_tensor((2,), "int64") + count_shape = output_tensor((2,), "int64") + + out_shape[0] = data_shape[0] + out_shape[1] = data_shape[1] + count_shape[0] = data_shape[0] + count_shape[1] = int64(1) + return out_shape, count_shape + +@reg.register_shape_func("vision.non_max_suppression", False) +def nms_shape_func(attrs, inputs, _): + if attrs.return_indices: + return _nms_shape_func(inputs[0]) + else: + return [topi.math.identity(inputs[0])] diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 247e6b58b709..96a89207d9fa 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -466,8 +466,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, zero=tvm.tir.const(0, dtype=data.dtype), one=tvm.tir.const(1, dtype=data.dtype)) if return_indices: - box_indices, out_shape = hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32")) - return tuple([box_indices, out_shape]) + return hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32")) if invalid_to_bottom: out, out_shape = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) From e40216453b0e2bc53344c76c7280fbb1cb8ecef3 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 17 Apr 2020 00:32:01 +0800 Subject: [PATCH 06/22] fix for fuse --- python/tvm/relay/op/_transform.py | 16 +++++++++++++--- python/tvm/relay/op/strategy/generic.py | 7 +++---- python/tvm/relay/op/vision/_vision.py | 3 +-- python/tvm/relay/op/vision/nms.py | 2 +- src/relay/op/tensor/transform.cc | 6 ++++-- src/relay/transforms/fuse_ops.cc | 5 +++++ tests/python/relay/test_op_level5.py | 14 ++++++-------- topi/tests/python/test_topi_vision.py | 3 ++- 8 files changed, 35 insertions(+), 21 deletions(-) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 7d81ab13f6db..378911a27e58 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -102,11 +102,21 @@ def arange_shape_func(attrs, inputs, _): return [_arange_shape_func(*inputs)] @script -def _strided_slice_shape_func(data_shape, begin, end, strides): - ndim = len(data_shape.shape) +def _strided_slice_shape_func(data, begin, end, strides): + ndim = len(data.shape) out = output_tensor((ndim,), "int64") for i in const_range(ndim): - out[i] = int64(ceil_div((int64(end[i]) - int64(begin[i])), int64(strides[i]))) + cbegin = 0 + cend = data.shape[i] + cstride = 1 + if len(begin) > i: + cbegin = begin[i] + if len(end) > i: + cend = end[i] + if len(strides) > i: + cstride = strides[i] + assert cstride != 0, "Strides can't 
be zero." + out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) return out @_reg.register_shape_func("strided_slice", True) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index a843c33e3cf7..de808d1edbf4 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -699,10 +699,9 @@ def _compute_nms(attrs, inputs, out_type): return topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, force_suppress, top_k, coord_start, score_index, id_index, return_indices, invalid_to_bottom) - else: - return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, - force_suppress, top_k, coord_start, score_index, id_index, - return_indices, invalid_to_bottom)] + return [topi_compute(inputs[0], inputs[1], inputs[2], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, id_index, + return_indices, invalid_to_bottom)] return _compute_nms @override_native_generic_func("non_max_suppression_strategy") diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 00d369c89c94..094671c74284 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -78,5 +78,4 @@ def _nms_shape_func(data_shape): def nms_shape_func(attrs, inputs, _): if attrs.return_indices: return _nms_shape_func(inputs[0]) - else: - return [topi.math.identity(inputs[0])] + return [topi.math.identity(inputs[0])] diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index d599ea55d0ba..38dcbe5452be 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -134,5 +134,5 @@ def non_max_suppression(data, return_indices, invalid_to_bottom) if return_indices: - return TupleWrapper(out, 2) + return expr.TupleWrapper(out, 2) return out diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index b464330c0bb9..a6d7e3a6730a 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1799,8 +1799,10 @@ bool StridedSliceRel(const Array& types, // Require concrete integer as symbolic inference of min/max // can get complicated and not very helpful. const int64_t* p_dim_size = tir::as_const_int(dshape[i]); - CHECK(p_dim_size) - << "strided_slice requires sliced dimension to be concrete int"; + if (!p_dim_size) { + oshape[i] = dshape[i]; + continue; + } int64_t dim_size = p_dim_size[0]; begin_v = (begin_v < 0) ? dim_size + begin_v : begin_v; end_v = (end_v < 0) ? 
dim_size + end_v : end_v; diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 13bfa7bb28de..566f1424e3ea 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -252,6 +252,10 @@ class IndexedForwardGraph::Creator : private ExprVisitor { if (call->attrs.as()) { bool is_dyn{false}; for (auto arg : call->args) { + if (!arg.as()) { + is_dyn = true; + break; + } auto arg_tt = arg->checked_type().as(); if (arg_tt) { for (auto dim : arg_tt->shape) { @@ -266,6 +270,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { op_pattern = kInjective; } } + node->pattern = op_pattern; this->Update(call->op, nullptr, kOpaque); const auto* rtype = call->checked_type().as(); diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 0cee8a0f391f..c20a66729712 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -317,15 +317,13 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - if top_k == -1: - op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data) - tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data) + tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if top_k == -1: - op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data) - tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data) + tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], @@ -337,7 +335,7 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[3, 0, -1, -1, -1]]) + np_indices_result = np.array([[4, 0, -1, -1, -1]]) num_anchors = 5 dshape = (te.size_var("n"), num_anchors, 6) @@ -350,7 +348,7 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) - np_indices_result = np.array([[3, 0, 1, -1, -1]]) + np_indices_result = np.array([[4, 0, 1, -1, -1]]) dshape = (te.size_var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, np_indices, dshape, np_result, np_indices_result, check_type_only=True) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 072fa8eb7e32..d2331ee0c7f7 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -153,7 +153,8 @@ def check_device(device): coord_start=coord_start, score_index=score_index, id_index=id_index, return_indices=False) indices_out = 
fcompute(data, valid_count, indices, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index) + coord_start=coord_start, score_index=score_index, id_index=id_index, + return_indices=True) s = fschedule(out) indices_s = fschedule(indices_out) From b40a5cf5dba0ae674a6679cfbb7e30d209d3728c Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 16 Apr 2020 17:35:04 +0000 Subject: [PATCH 07/22] Workaround to pass batch_size into hybrid function to handle dynamic shape --- topi/python/topi/vision/nms.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 96a89207d9fa..f427b2276994 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -24,7 +24,7 @@ @hybrid.script -def hybrid_rearrange_out(data, one): +def hybrid_rearrange_out(data, one, batch_size): """Hybrid routine to rearrange nms output to move all valid entries to top. @@ -39,6 +39,10 @@ def hybrid_rearrange_out(data, one): one: tvm.tir.const Constant one with the same dtype as data. + batch_size: tvm.tir.IntImm or tvm.tir.Var + Batch size. We need to pass it in since hybrid script doesn't support + binding variable to symbolic dim. + Returns ------- output : tvm.te.Tensor or numpy NDArray @@ -51,7 +55,6 @@ def hybrid_rearrange_out(data, one): the valid number of boxes. """ ndim = len(data.shape) - batch_size = data.shape[0] num_anchors = data.shape[1] valid_box_count = output_tensor((batch_size, 1), "int32") output = output_tensor((batch_size, num_anchors), data.dtype) @@ -88,7 +91,7 @@ def hybrid_rearrange_out(data, one): @hybrid.script -def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): +def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one, batch_size): """Hybrid routine to get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -111,6 +114,10 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): one: tvm.tir.const Constant one with the same dtype as data. + batch_size: tvm.tir.IntImm or tvm.tir.Var + Batch size. We need to pass it in since hybrid script doesn't support + binding variable to symbolic dim. + Returns ------- valid_count : tvm.te.Tensor or numpy NDArray @@ -122,7 +129,6 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): out_indices: tvm.te.Tensor or numpy NDArray Related index in input data. """ - batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] valid_count = output_tensor((batch_size,), "int32") @@ -183,11 +189,12 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): score_index_const = tvm.tir.const(score_index, "int32") return hybrid_get_valid_counts(data, score_threshold_const, id_index_const, score_index_const, - tvm.tir.const(1, data.dtype)) + tvm.tir.const(1, data.dtype), + data.shape[0]) @hybrid.script -def hybrid_nms(data, sorted_index, valid_count, indices, max_output_size, +def hybrid_nms(data, sorted_index, valid_count, indices, batch_size, max_output_size, iou_threshold, force_suppress, top_k, coord_start, score_index, id_index, return_indices, zero, one): """Hybrid routing for non-maximum suppression. 
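[Editor's note, not part of the diff] The extra `batch_size` argument added throughout this commit works around a hybrid script limitation: a symbolic first dimension (relay `Any`) read from `data.shape` cannot be bound as an output extent, so the extent is threaded in explicitly and bound at the call site (`data.shape[0]` in `get_valid_counts` above). A stripped-down sketch of the idiom; the function name is hypothetical:

    from tvm.te import hybrid

    @hybrid.script
    def copy_rows(data, batch_size):
        # batch_size arrives as tvm.tir.IntImm or tvm.tir.Var; data.shape[0]
        # may be symbolic, so it is not used as the extent directly.
        out = output_tensor((batch_size, data.shape[1]), data.dtype)
        for i in parallel(batch_size):
            for j in range(data.shape[1]):
                out[i, j] = data[i, j]
        return out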
@@ -208,6 +215,10 @@ def hybrid_nms(data, sorted_index, valid_count, indices, max_output_size, indices : tvm.te.Tensor or numpy.NDArray indices in original tensor, with shape [batch_size, num_anchors] + batch_size: tvm.tir.IntImm or tvm.tir.Var + Batch size. We need to pass it in since hybrid script doesn't support + binding variable to symbolic dim. + max_output_size : tvm.tir.const Max number of output valid boxes for each instance. By default all valid boxes are returned. @@ -249,7 +260,6 @@ def hybrid_nms(data, sorted_index, valid_count, indices, max_output_size, 2-D tensor with shape [batch_size, num_anchors]. """ - batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] @@ -455,6 +465,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, sort_tensor, valid_count, indices, + batch_size, tvm.tir.const(max_output_size, dtype="int32"), tvm.tir.const(iou_threshold, dtype=data.dtype), tvm.tir.const(force_suppress, dtype="bool"), @@ -466,8 +477,10 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, zero=tvm.tir.const(0, dtype=data.dtype), one=tvm.tir.const(1, dtype=data.dtype)) if return_indices: - return hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32")) + return hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32"), + batch_size=batch_size) if invalid_to_bottom: - out, out_shape = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) + out, _ = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype), + batch_size=batch_size) return out From bd67550477e5209435e4e8e6266e39a112f9ad42 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Thu, 16 Apr 2020 19:10:32 +0000 Subject: [PATCH 08/22] Seperate rearrange --- topi/python/topi/vision/nms.py | 89 ++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index f427b2276994..ff865ce988c0 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -22,9 +22,52 @@ from tvm.te import hybrid from ..sort import argsort +@hybrid.script +def hybrid_rearrange_box_out(data, one, batch_size): + """Hybrid routine to rearrange nms output to + move all valid entries to top. + + Parameters + ---------- + data : tvm.te.Tensor or numpy NDArray + NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + one: tvm.tir.const + Constant one with the same dtype as data. + + batch_size: tvm.tir.IntImm or tvm.tir.Var + Batch size. We need to pass it in since hybrid script doesn't support + binding variable to symbolic dim. + + Returns + ------- + output : tvm.te.Tensor or numpy NDArray + Transformed NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + """ + num_anchors = data.shape[1] + elem_length = data.shape[2] + output = output_tensor((batch_size, + num_anchors, + elem_length), + data.dtype) + + for i in parallel(batch_size): + valid_idx = 0 + for j in range(num_anchors): + if data[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = data[i, j, k] + valid_idx += 1 + if j >= valid_idx: + for k in range(elem_length): + output[i, j, k] = -one + return output + @hybrid.script -def hybrid_rearrange_out(data, one, batch_size): +def hybrid_rearrange_indices_out(data, one, batch_size): """Hybrid routine to rearrange nms output to move all valid entries to top. 
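[Editor's note, not part of the diff] PATCH 08 splits the rank-polymorphic rearrange routine into `hybrid_rearrange_box_out` (3-D boxes) and `hybrid_rearrange_indices_out` (2-D indices plus a valid-box count), so each hybrid function has a rank-static output. Hybrid script also runs in software emulation when handed numpy arrays, which makes the intended behavior easy to check; a sketch, assuming the emulator accepts plain Python ints for the scalar `one` and `batch_size` parameters:

    import numpy as np
    from topi.vision.nms import hybrid_rearrange_indices_out

    box_indices = np.array([[-1, 4, -1, 0, -1]], dtype="int32")
    out, count = hybrid_rearrange_indices_out(box_indices, 1, 1)
    # out   -> [[4, 0, -1, -1, -1]]  valid entries compacted to the top
    # count -> [[2]]                 number of valid boxes per batch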
@@ -46,45 +89,27 @@ def hybrid_rearrange_out(data, one, batch_size): Returns ------- output : tvm.te.Tensor or numpy NDArray - Transformed NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5], - or 2-D tensor with shape [batch_size, num_anchors]. + 2-D tensor with shape [batch_size, num_anchors]. valid_box_count : tvm.te.Tensor or numpy NDArray Tensor with shape [batch_size, 1], indicates the valid number of boxes. """ - ndim = len(data.shape) num_anchors = data.shape[1] valid_box_count = output_tensor((batch_size, 1), "int32") output = output_tensor((batch_size, num_anchors), data.dtype) - if ndim > 2: - output = output_tensor((batch_size, - num_anchors, - data.shape[2]), - data.dtype) for i in parallel(batch_size): valid_idx = 0 for j in range(num_anchors): - if ndim > 2: - elem_length = data.shape[2] - if data[i, j, 0] >= 0: - for k in range(elem_length): - output[i, valid_idx, k] = data[i, j, k] - valid_idx += 1 - if j >= valid_idx: - for k in range(elem_length): - output[i, j, k] = -one - else: - if data[i, j] >= 0: - output[i, valid_idx] = data[i, j] - valid_idx += 1 - if data[i, j] > num_anchors or data[i, j] < -num_anchors: - output[i, valid_idx] = 0 - valid_idx += 1 - if j >= valid_idx: - output[i, j] = -one + if data[i, j] >= 0: + output[i, valid_idx] = data[i, j] + valid_idx += 1 + if data[i, j] > num_anchors or data[i, j] < -num_anchors: + output[i, valid_idx] = 0 + valid_idx += 1 + if j >= valid_idx: + output[i, j] = -one valid_box_count[i, 0] = valid_idx return output, valid_box_count @@ -477,10 +502,10 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, zero=tvm.tir.const(0, dtype=data.dtype), one=tvm.tir.const(1, dtype=data.dtype)) if return_indices: - return hybrid_rearrange_out(box_indices, one=tvm.tir.const(1, dtype="int32"), - batch_size=batch_size) + return hybrid_rearrange_indices_out(box_indices, one=tvm.tir.const(1, dtype="int32"), + batch_size=batch_size) if invalid_to_bottom: - out, _ = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype), - batch_size=batch_size) + out = hybrid_rearrange_box_out(out, one=tvm.tir.const(1, dtype=data.dtype), + batch_size=batch_size) return out From 70051b571279dc0126e1c18726470205d598245d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 17 Apr 2020 05:37:22 +0800 Subject: [PATCH 09/22] fix lint --- topi/python/topi/cuda/nms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 255cf6fc30f7..2a206f6cbe68 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -448,4 +448,4 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, if return_indices: return box_indices - return out \ No newline at end of file + return out From c142880eafc8d377de4f2eeae3221bdef1f1b8da Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Tue, 21 Apr 2020 05:19:28 +0800 Subject: [PATCH 10/22] fix ci, comments --- include/tvm/relay/attrs/transform.h | 3 + include/tvm/relay/attrs/vision.h | 3 - python/tvm/relay/frontend/tensorflow.py | 5 +- python/tvm/relay/op/_transform.py | 11 +-- python/tvm/relay/op/transform.py | 31 ++++++-- src/relay/op/tensor/transform.cc | 79 ++++++++++++------- src/relay/op/vision/nms.cc | 12 +-- .../transforms/combine_parallel_conv2d.cc | 15 ++-- src/relay/transforms/fuse_ops.cc | 22 ------ src/relay/transforms/pattern_util.h | 2 +- tests/python/relay/test_any.py | 5 +- tests/python/relay/test_op_level2.py | 2 +- 
tests/python/relay/test_op_level4.py | 26 +++--- .../python/relay/test_pass_alter_op_layout.py | 41 +++++++--- .../test_pass_combine_parallel_conv2d.py | 62 +++++++++++---- .../topi/testing/strided_slice_python.py | 7 +- topi/python/topi/vision/nms.py | 33 +++++--- 17 files changed, 221 insertions(+), 138 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 4f0c90ec4f4a..a4c3d0194b22 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -213,6 +213,7 @@ struct StridedSliceAttrs : public tvm::AttrsNode { Expr begin; Expr end; Expr strides; + bool ignore_end; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { TVM_ATTR_FIELD(begin) @@ -221,6 +222,8 @@ struct StridedSliceAttrs : public tvm::AttrsNode { .describe("Indices for end of slice, end index is exclusive"); TVM_ATTR_FIELD(strides) .describe("Stride values of the slice"); + TVM_ATTR_FIELD(ignore_end).set_default(false) + .describe("Whether to ignore the input end and infer value of end from input data"); } }; diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 3edd23f34494..52669ea651ee 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -103,9 +103,6 @@ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode 4 else None + score_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0] if len(inputs) > 4 else 0.0 # Generate data with shape (1, num_anchors, 5) scores = AttrCvt(op_name="expand_dims", diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 378911a27e58..40fa1caf8943 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -102,18 +102,18 @@ def arange_shape_func(attrs, inputs, _): return [_arange_shape_func(*inputs)] @script -def _strided_slice_shape_func(data, begin, end, strides): +def _strided_slice_shape_func(data, begin, end, strides, ignore_end): ndim = len(data.shape) out = output_tensor((ndim,), "int64") for i in const_range(ndim): cbegin = 0 cend = data.shape[i] cstride = 1 - if len(begin) > i: + if begin.shape[0] > i: cbegin = begin[i] - if len(end) > i: + if ignore_end != 0 or end.shape[0] > i: cend = end[i] - if len(strides) > i: + if strides.shape[0] > i: cstride = strides[i] assert cstride != 0, "Strides can't be zero." out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) @@ -121,7 +121,8 @@ def _strided_slice_shape_func(data, begin, end, strides): @_reg.register_shape_func("strided_slice", True) def strided_slice_shape_func(attrs, inputs, _): - return [_strided_slice_shape_func(*inputs)] + ignore_end = attrs.ignore_end + return [_strided_slice_shape_func(*inputs, convert(get_const_int(ignore_end)))] @script def _concatenate_shape_func(inputs, axis): diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 44b8c1c03f9d..6033aae3e960 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -611,7 +611,7 @@ def split(data, indices_or_sections, axis=0): return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size) -def strided_slice(data, begin, end, strides=None): +def strided_slice(data, begin, end, strides=None, ignore_end=False): """Strided slice of an array. Parameters @@ -619,23 +619,32 @@ def strided_slice(data, begin, end, strides=None): data : relay.Expr The source array to be sliced. 
- begin: relay.Expr + begin: relay.Expr or List[int] The indices to begin with in the slicing. - end: relay.Expr + end: relay.Expr or List[int] Indices indicating end of the slice. - strides: relay.Expr, optional + strides: relay.Expr or List[int], optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. + ignore_end: boolean, optional + Whether to ignore input end. + Returns ------- ret : relay.Expr The computed result. """ strides = strides or const([1], dtype="int32") - return _make.strided_slice(data, begin, end, strides) + if isinstance(begin, list): + begin = const(list(begin)) + if isinstance(end, list): + end = const(list(end)) + if isinstance(strides, list): + strides = const(list(strides)) + return _make.strided_slice(data, begin, end, strides, ignore_end) def strided_set(data, v, begin, end, strides=None): @@ -649,13 +658,13 @@ def strided_set(data, v, begin, end, strides=None): v : relay.Expr The data to be set. - begin: relay.Expr + begin: relay.Expr or List[int] The indices to begin with in the slicing. - end: relay.Expr + end: relay.Expr or List[int] Indices indicating end of the slice. - strides: relay.Expr, optional + strides: relay.Expr or List[int], optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. @@ -665,6 +674,12 @@ def strided_set(data, v, begin, end, strides=None): The computed result. """ strides = strides or const([1], dtype="int32") + if isinstance(begin, list): + begin = const(list(begin)) + if isinstance(end, list): + end = const(list(end)) + if isinstance(strides, list): + strides = const(list(strides)) return _make.strided_set(data, v, begin, end, strides) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index a6d7e3a6730a..27c459e76b33 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1096,11 +1096,11 @@ bool ArangeRel(const Array& types, int num_inputs, const Attrs& raw_attrs, } inline te::Tensor DynamicArange(const te::Tensor& start, - const te::Tensor& stop, - const te::Tensor& step, - tvm::DataType dtype, - std::string name = "T_arange_dynamic", - std::string tag = topi::kInjective) { + const te::Tensor& stop, + const te::Tensor& step, + tvm::DataType dtype, + std::string name = "T_arange_dynamic", + std::string tag = topi::kInjective) { tvm::PrimExpr num_elem = tvm::tir::Var("num_elem"); return te::compute( {num_elem}, @@ -1774,10 +1774,18 @@ bool StridedSliceRel(const Array& types, std::vector end_vec; int64_t* end_val = ToVector(cend->data); for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { - end_vec.push_back(end_val[i]); + if (param->ignore_end) { + end_vec.push_back(max_range); + } else { + end_vec.push_back(end_val[i]); + } } for (int64_t i = end_vec.size(); i < num_axis; ++i) { - end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); + if (param->ignore_end) { + end_vec.push_back(max_range); + } else { + end_vec.push_back(stride_vec[i] < 0 ? 
0 : max_range); + } } for (int64_t i = 0; i < num_axis; ++i) { @@ -1810,7 +1818,7 @@ bool StridedSliceRel(const Array& types, int64_t slice_range, step; if (stride_v < 0) { if (end_v < -1) end_v = -1; - CHECK_LT(end_v, begin_v) + CHECK_LE(end_v, begin_v) << "strided_slice get empty slice at axis " << i; begin_v = std::min(dim_size - 1, begin_v); slice_range = begin_v - end_v; @@ -1861,9 +1869,10 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, CHECK(params != nullptr); Array begin, end, strides; const ConstantNode *cbegin, *cend, *cstrides; - if ((cbegin = params->begin.as()) && - (cend = params->end.as()) && - (cstrides = params->strides.as())) { + cbegin = params->begin.as(); + cend = params->end.as(); + cstrides = params->strides.as(); + if (cbegin && cend && cstrides) { int64_t* strides_val = ToVector(cstrides->data); for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { strides.push_back(strides_val[i]); @@ -1923,22 +1932,25 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, DataType::Int(64), ctx); int64_t* begin_data = static_cast(begin_ndarray->data); int64_t* end_data = static_cast(end_ndarray->data); + int64_t* strides_data = static_cast(strides_ndarray->data); for (size_t i = 0; i < new_begin.size(); ++i) { begin_data[i] = new_begin[i]; end_data[i] = new_end[i]; + strides_data[i] = 1; } params->begin = Constant(begin_ndarray); params->end = Constant(end_ndarray); + params->strides = Constant(strides_ndarray); } return {{layout, Layout("C"), Layout("C"), Layout("C")}, {layout}}; } inline te::Tensor DynamicStridedSlice(const te::Tensor& input, - const te::Tensor& begin, - const te::Tensor& end, - const te::Tensor& strides, - std::string name = "T_strided_slice_dynamic", - std::string tag = topi::kInjective) { + const te::Tensor& begin, + const te::Tensor& end, + const te::Tensor& strides, + std::string name = "T_strided_slice_dynamic", + std::string tag = topi::kInjective) { int64_t src_tensor_dim = input->shape.size(); Array out_shape; for (int64_t i = 0; i < src_tensor_dim; ++i) { @@ -1984,6 +1996,11 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayshape[0].as()->value == data->shape.size() + && end->shape[0].as()->value == data->shape.size() + && strides->shape[0].as()->value == data->shape.size()) + << "begin, end, and strides are required to have the same length" + << " if they are non-constant."; return Array{ DynamicStridedSlice(data, begin, end, strides) }; @@ -1994,11 +2011,13 @@ Array StridedSliceCompute(const Attrs& attrs, const Array(); attrs->begin = begin; attrs->end = end; attrs->strides = strides; + attrs->ignore_end = ignore_end; static const Op& op = Op::Get("strided_slice"); return Call(op, {data, begin, end, strides}, Attrs(attrs), {}); } @@ -2031,19 +2050,19 @@ Examples:: [[ 5., 6.], [ 7., 8.]]] )code" TVM_ADD_FILELINE) -.set_num_inputs(4) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("begin", "Tensor", "The indices to begin with in the slicing.") -.add_argument("end", "Tensor", "Indices indicating end of the slice.") -.add_argument("strides", "Tensor", "The stride values.") -.set_support_level(4) -.set_attrs_type() -.add_type_rel("StridedSlice", StridedSliceRel) -.set_attr("FTVMCompute", StridedSliceCompute) -// TODO(@icemelon, @yongwww): Change to kOpaque because FuseOps doesn't consider dynamic shape -.set_attr("TOpPattern", kOpaque) -.set_attr("AnyCodegenStrategy", kVariableDimensions) -.set_attr("FInferCorrectLayout", StridedSliceInferCorrectLayout); + .set_num_inputs(4) + 
.add_argument("data", "Tensor", "The input tensor.") + .add_argument("begin", "Tensor", "The indices to begin with in the slicing.") + .add_argument("end", "Tensor", "Indices indicating end of the slice.") + .add_argument("strides", "Tensor", "The stride values.") + .add_argument("ignore_end", "Tensor", "Whether to ignore end.") + .set_support_level(4) + .set_attrs_type() + .add_type_rel("StridedSlice", StridedSliceRel) + .set_attr("FTVMCompute", StridedSliceCompute) + .set_attr("TOpPattern", kInjective) + .set_attr("AnyCodegenStrategy", kVariableDimensions) + .set_attr("FInferCorrectLayout", StridedSliceInferCorrectLayout); // strided_set bool StridedSetRel(const Array& types, int num_inputs, const Attrs& attrs, diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 4002820d0c15..e51432931605 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -133,12 +133,12 @@ be in the format of [class_id, score, left, top, right, bottom] or [score, left, top, right, bottom]. Set id_index to be -1 to ignore class_id axis. )doc" TVM_ADD_FILELINE) -.set_num_inputs(3) -.add_argument("data", "Tensor", "Input data.") -.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") -.add_argument("indices", "Tensor", "Corresponding indices in original input tensor.") -.set_support_level(5) -.add_type_rel("NMS", NMSRel); + .set_num_inputs(3) + .add_argument("data", "Tensor", "Input data.") + .add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") + .add_argument("indices", "Tensor", "Corresponding indices in original input tensor.") + .set_support_level(5) + .add_type_rel("NMS", NMSRel); } // namespace relay } // namespace tvm diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 6c53b6706fc9..d17e2fcd9655 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -167,6 +167,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { void UpdateGroupOutput(const Expr& data, const Group& branches, size_t depth, ExprSubstMap* subst_map) { int64_t index = 0; + for (const auto& branch : branches) { const CallNode* conv2d = branch[0]; int64_t channels = GetConv2DSuperChannelsDim(conv2d); @@ -174,13 +175,11 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { Array end; for (size_t i = 0; i < channel_pos_; i++) { begin.push_back(0); - end.push_back(NullValue()); + end.push_back(channels); } begin.push_back(index); index += channels; end.push_back(index); - - DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; @@ -190,18 +189,22 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { DataType::Int(64), ctx); auto strides_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); - int64_t* begin_data = static_cast(begin_ndarray->data); - int64_t* end_data = static_cast(end_ndarray->data); + + auto* begin_data = static_cast(begin_ndarray->data); + auto* end_data = static_cast(end_ndarray->data); + auto* strides_data = static_cast(strides_ndarray->data); for (size_t i = 0; i < begin.size(); ++i) { begin_data[i] = begin[i]; end_data[i] = end[i]; + strides_data[i] = 1; } auto slice = MakeStridedSlice(data, Constant(begin_ndarray), Constant(end_ndarray), - Constant(strides_ndarray)); + Constant(strides_ndarray), + false); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 
566f1424e3ea..01f1eeea30b3 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -249,28 +249,6 @@ class IndexedForwardGraph::Creator : private ExprVisitor { this->Update(call->op, node, kOpaque); } - if (call->attrs.as()) { - bool is_dyn{false}; - for (auto arg : call->args) { - if (!arg.as()) { - is_dyn = true; - break; - } - auto arg_tt = arg->checked_type().as(); - if (arg_tt) { - for (auto dim : arg_tt->shape) { - if (dim.as()) { - is_dyn = true; - } - } - } - if (is_dyn) break; - } - if (!is_dyn) { - op_pattern = kInjective; - } - } - node->pattern = op_pattern; this->Update(call->op, nullptr, kOpaque); const auto* rtype = call->checked_type().as(); diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index 8964959bfcfd..89f29fcc0cce 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -673,7 +673,7 @@ Expr MakeConcatenate(Expr data, int axis); Expr MakeRepeat(Expr data, int repeats, int axis); -Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides); +Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool ignore_end); Expr MakeStack(Expr data, int axis); diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 9464e865b4e1..e5a7521ce721 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -644,13 +644,13 @@ def test_arange_with_dynamic_shape(): tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) def verify_any_strided_slice(data_shape, begin_shape, end_shape, - strides_shape, data_np_shape): + strides_shape, data_np_shape, ignore_end=False): mod = tvm.IRModule() data = relay.var('data', shape=data_shape, dtype='float32') begin = relay.var('begin', shape=begin_shape, dtype="int32") end = relay.var('end', shape=end_shape, dtype="int32") strides = relay.var('strides', shape=strides_shape, dtype="int32") - y = relay.strided_slice(data, begin, end, strides) + y = relay.strided_slice(data, begin, end, strides, ignore_end) mod["main"] = relay.Function([data, begin, end, strides], y) # Generate random numpy input data @@ -670,6 +670,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) + verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), ignore_end=True) def test_recursive_concat(): diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 68eced328fa8..c9a19044ea33 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -780,7 +780,7 @@ def _test_pool2d_int(opfunc, reffunc, dtype): x = relay.var("x", shape=dshape, dtype=dtype) y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) func = relay.Function([x], y) - data = np.random.random_integers(low=-128, high=128, size=dshape) + data = np.random.randint(low=-128, high=128, size=dshape) ref_res = reffunc(data.reshape(1,3,14,2,14,2), axis=(3,5)).astype(dtype) for target, ctx in ctx_list(): intrp1 = relay.create_executor("graph", ctx=ctx, target=target) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index ea5ebcd6c265..4b5e19223dca 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -58,11 +58,11 @@ def check_binary_op(opfunc, ref): 
def test_cmp_type(): for op, ref in ((relay.greater, np.greater), - (relay.greater_equal, np.greater_equal), - (relay.less, np.less), - (relay.less_equal, np.less_equal), - (relay.equal, np.equal), - (relay.not_equal, np.not_equal)): + (relay.greater_equal, np.greater_equal), + (relay.less, np.less), + (relay.less_equal, np.less_equal), + (relay.equal, np.equal), + (relay.not_equal, np.not_equal)): x = relay.var("x", relay.TensorType((10, 4), "float32")) y = relay.var("y", relay.TensorType((5, 10, 1), "float32")) z = op(x, y) @@ -296,7 +296,8 @@ def test_mean_var_std(): def test_strided_slice(): - def verify(dshape, begin, end, strides, output, test_ref=True, dtype="int32"): + def verify(dshape, begin, end, strides, output, + ignore_end=False, test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) begin = begin if begin else [0] * ndim @@ -308,11 +309,13 @@ def verify(dshape, begin, end, strides, output, test_ref=True, dtype="int32"): z = relay.strided_slice(x, begin=begin_expr, end=end_expr, - strides=strides_expr) + strides=strides_expr, + ignore_end=ignore_end) else: z = relay.strided_slice(x, begin=begin_expr, - end=end_expr) + end=end_expr, + ignore_end=ignore_end) func = relay.Function([x], z) func = run_infer_type(func) @@ -320,6 +323,7 @@ def verify(dshape, begin, end, strides, output, test_ref=True, dtype="int32"): assert "begin=" in text assert "end=" in text + if output: assert func.body.checked_type == relay.ty.TensorType(output, "float32") @@ -333,10 +337,12 @@ def verify(dshape, begin, end, strides, output, test_ref=True, dtype="int32"): op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], (1, 120, 120, 3), dtype="int64") + verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], + (1, 120, 120, 3), dtype="int64") verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16") verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) - verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2)) + verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (2, 4, 2), + ignore_end=True, test_ref=False) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3)) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index c45b82f36602..ee4a27c20316 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -18,10 +18,11 @@ import pytest import tvm -from tvm import te from tvm import relay from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr +from tvm.relay.testing import ctx_list, run_infer_type +import numpy as np def run_opt_pass(expr, passes): passes = passes if isinstance(passes, list) else [passes] @@ -620,7 +621,10 @@ def before(): x = relay.var("x", shape=(1, 32, 28, 28)) weight = relay.var('weight', shape=(32, 32, 3, 3)) y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1)) - y = relay.strided_slice(y, begin=relay.const([0, 16], "int32"), end=relay.const([1, 32], "int32")) + y = relay.strided_slice(y, + begin=relay.const([0, 16], "int32"), + end=relay.const([1, 33], "int32"), + strides=relay.const([1, 1], "int32")) y = relay.Function(analysis.free_vars(y), y) return y @@ -632,22 +636,41 @@ 
def alter_conv2d(attrs, inputs, tinfos, out_type): def expected(): x = relay.var("x", shape=(1, 32, 28, 28)) - weight = relay.var("weight") + weight = relay.var("weight", shape=(32, 32, 3, 3)) + weight = relay.layout_transform(weight, "OIHW", "OIHW4i4o") x = relay.layout_transform(x, "NCHW", "NCHW4c") - y = relay.nn.conv2d(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), - data_layout="NCHW4c") - y = relay.strided_slice(y, begin=relay.const([0, 4], "int32"), end=relay.const([1, 8], "int32")) + y = relay.op.nn.contrib_conv2d_nchwc(x, weight, channels=32, kernel_size=(3, 3), padding=(1, 1), + data_layout="NCHW4c") + + y = relay.strided_slice(y, + begin=relay.const([0, 4], "int32"), + end=relay.const([1, 21], "int32"), # [1, 8] + strides=relay.const([1, 1], "int32")) + y = relay.layout_transform(y, "NCHW4c", "NCHW") y = relay.Function(analysis.free_vars(y), y) return y with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): a = before() - a = run_opt_pass(a, [transform.CanonicalizeOps(), - transform.AlterOpLayout()]) b = run_opt_pass(expected(), transform.InferType()) - assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + # Verify inference result + mod_before = tvm.IRModule() + mod_new = tvm.IRModule() + mod_before['main'] = a + mod_new['main'] = b + with relay.build_config(opt_level=3): + for target, ctx in ctx_list(): + for kind in ["graph", "debug", "vm"]: + ex_before = relay.create_executor(kind, mod=mod_before, ctx=ctx, target=target) + ex_new = relay.create_executor(kind, mod=mod_new, ctx=ctx, target=target) + np_data = np.random.uniform(size=(1, 32, 28, 28)).astype("float32") + np_weight = np.random.uniform(size=(32, 32, 3, 3)).astype("float32") + result_before = ex_before.evaluate()(np_data, np_weight) + result_new = ex_new.evaluate()(np_data, np_weight) + tvm.testing.assert_allclose(result_before.asnumpy(), result_new.asnumpy(), rtol=1e-5, atol=1e-5) + def test_alter_layout_depthwise_conv2d(): """Test depthwise_conv2d operator""" diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 7f7f18598589..291000965be9 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. 
import tvm -from tvm import te from tvm import relay from tvm.relay import transform +import numpy as np def run_combine_parallel(expr, min_num_branches=3): @@ -50,17 +50,25 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): args = [x, w1, w2, w3, w4] w = relay.concatenate((w1, w2, w4), axis=0) y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4) - y1 = relay.strided_slice(y, [0, 0], [None, channels1]) - y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2]) + y1 = relay.strided_slice(y, + begin=relay.const([0, 0], "int64"), + end=relay.const([1, channels1], "int64"), + strides=relay.const([2, 1], 'int64')) + y2 = relay.strided_slice(y, + begin=relay.const([0, channels1], "int64"), + end=relay.const([1, channels1 + channels2], "int64"), + strides=relay.const([2, 1], 'int64')) y3 = relay.nn.conv2d(x, w3) - y4 = relay.strided_slice(y, [0, channels1 + channels2], - [None, channels1 + channels2 + channels4]) + y4 = relay.strided_slice(y, + begin=relay.const([0, channels1 + channels2], "int64"), + end=relay.const([1, channels1 + channels2 + channels4], "int64"), + strides=relay.const([2, 1], 'int64')) y5 = relay.nn.max_pool2d(x) y = relay.Tuple((y1, y2, y3, y4, y5)) return relay.Function(args, y) def check(x_shape, channels1, channels2, channels3, channels4): - x = relay.var("x", shape=x_shape) + x = relay.var("x", shape=x_shape) in_c = x_shape[1] w1 = relay.var("w1", shape=(channels1, in_c, 1, 1)) w2 = relay.var("w2", shape=(channels2, in_c, 1, 1)) @@ -72,7 +80,8 @@ def check(x_shape, channels1, channels2, channels3, channels4): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4) y_expected = run_opt_pass(y_expected, transform.InferType()) - assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + np.testing.assert_string_equal(str(y), str(y_expected)),\ + "Actual = \n" + str(y) + "\nExpected = \n" + str(y_expected) check((1, 4, 16, 16), 4, 4, 4, 4) check((1, 4, 16, 16), 4, 8, 4, 7) @@ -99,8 +108,14 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): y = relay.nn.conv2d(x, w, channels=channels1 + channels2) y = relay.multiply(y, scale) y = relay.nn.relu(y) - y1 = relay.strided_slice(y, [0, 0], [None, channels1]) - y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2]) + y1 = relay.strided_slice(y, + begin=relay.const([0, 0], "int64"), + end=relay.const([1, channels1], "int64"), + strides=relay.const([2, 1], "int64")) + y2 = relay.strided_slice(y, + begin=relay.const([0, channels1], "int64"), + end=relay.const([1, channels1 + channels2], "int64"), + strides=relay.const([2, 1], "int64")) y2 = relay.add(y2, bias) y = relay.Tuple((y1, y2)) return relay.Function(args, y) @@ -118,7 +133,8 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + np.testing.assert_string_equal(str(y), str(y_expected)),\ + "Actual = \n" + str(y) + "Expected = \n" + str(y_expected) check((1, 4, 16, 16), 4, 8) @@ -138,8 +154,14 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): args = [x, w1, w2, scale1, scale2] w = relay.concatenate((w1, w2), axis=0) y = relay.nn.conv2d(x, w, channels=channels1 + channels2) - y1 = relay.strided_slice(y, [0, 0], [None, 
channels1]) - y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2]) + y1 = relay.strided_slice(y, + begin=relay.const([0, 0], "int64"), + end=relay.const([1, channels1], "int64"), + strides=relay.const([2, 1], "int64")) + y2 = relay.strided_slice(y, + begin=relay.const([0, channels1], "int64"), + end=relay.const([1, channels1 + channels2], "int64"), + strides=relay.const([2, 1], "int64")) y1 = relay.multiply(y1, scale1) y2 = relay.multiply(y2, scale2) y = relay.Tuple((y1, y2)) @@ -157,7 +179,8 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + np.testing.assert_string_equal(str(y), str(y_expected)),\ + "Actual = \n" + str(y) + "Expected = \n" + str(y_expected) check((1, 4, 16, 16), 4, 8) @@ -178,8 +201,14 @@ def expected(x, w, channels, repeat): for i in range(repeat): w_concat = relay.concatenate((w, w), axis=0) y = relay.nn.conv2d(y, w_concat, channels=channels*2) - y1 = relay.strided_slice(y, [0, 0], [None, channels]) - y2 = relay.strided_slice(y, [0, channels], [None, channels * 2]) + y1 = relay.strided_slice(y, + begin=relay.const([0, 0], "int64"), + end=relay.const([1, channels], "int64"), + strides=relay.const([2, 1], "int64")) + y2 = relay.strided_slice(y, + begin=relay.const([0, channels], "int64"), + end=relay.const([1, channels * 2], "int64"), + strides=relay.const([2, 1], "int64")) y = relay.concatenate((y1, y2), axis=1) return relay.Function(args, y) @@ -193,7 +222,8 @@ def check(x_shape, repeat): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w, out_c, repeat) y_expected = run_opt_pass(y_expected, transform.InferType()) - assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + np.testing.assert_string_equal(str(y), str(y_expected)),\ + "Actual = \n" + str(y) + "\nExpected = \n" + str(y_expected) check((1, 4, 16, 16), 4) diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py index c1c899afe31f..b21c3fb87119 100644 --- a/topi/python/topi/testing/strided_slice_python.py +++ b/topi/python/topi/testing/strided_slice_python.py @@ -17,7 +17,7 @@ """strided_slice/set in python""" -def strided_slice_python(data, begin, end, strides): +def strided_slice_python(data, begin, end, strides, ignore_end=False): """Python version of strided slice operator. Parameters @@ -34,6 +34,9 @@ def strided_slice_python(data, begin, end, strides): strides : list The stride of each slice. + ignore_end : boolean + Whether to ignore input end + Returns ------- result : numpy.ndarray @@ -44,7 +47,7 @@ def strided_slice_python(data, begin, end, strides): for i in range(len(data.shape)): slices.append(slice( begin[i] if i < len(begin) else None, - end[i] if i < len(end) else None, + end[i] if i < len(end) and not ignore_end else None, strides[i] if i < len(strides) else None)) return data[tuple(slices)] diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index ff865ce988c0..1a2089683b62 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -23,7 +23,7 @@ from ..sort import argsort @hybrid.script -def hybrid_rearrange_box_out(data, one, batch_size): +def hybrid_rearrange_box_out(data, one, batch_size, num_anchors): """Hybrid routine to rearrange nms output to move all valid entries to top. 
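[Reviewer sketch, not part of the patch] Two related dynamic-shape accommodations land together in this commit. In nms.py, num_anchors is threaded through as an explicit argument because hybrid script cannot bind a variable to a symbolic dimension of data.shape, mirroring how batch_size is already handled. And in the strided_slice_python reference above, the new ignore_end flag simply drops the provided end indices so every axis slices through to its full extent. A minimal NumPy model of that reference semantics (ref_slice and the sample shapes are illustrative, assuming only what the hunk shows):

    import numpy as np

    def ref_slice(data, begin, end, strides, ignore_end=False):
        # Mirrors strided_slice_python: missing entries default to the whole
        # axis, and end is discarded entirely when ignore_end is requested.
        slices = [slice(begin[i] if i < len(begin) else None,
                        end[i] if i < len(end) and not ignore_end else None,
                        strides[i] if i < len(strides) else None)
                  for i in range(len(data.shape))]
        return data[tuple(slices)]

    x = np.arange(24).reshape(2, 3, 4)
    assert ref_slice(x, [0, 1], [1, 2], [1, 1]).shape == (1, 1, 4)
    assert ref_slice(x, [0, 1], [1, 2], [1, 1], ignore_end=True).shape == (2, 2, 4)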
@@ -40,13 +40,15 @@ def hybrid_rearrange_box_out(data, one, batch_size): Batch size. We need to pass it in since hybrid script doesn't support binding variable to symbolic dim. + num_anchors: tvm.tir.IntImm or tvm.tir.Var + Number of anchors. + Returns ------- output : tvm.te.Tensor or numpy NDArray Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. """ - num_anchors = data.shape[1] elem_length = data.shape[2] output = output_tensor((batch_size, num_anchors, @@ -67,7 +69,7 @@ def hybrid_rearrange_box_out(data, one, batch_size): @hybrid.script -def hybrid_rearrange_indices_out(data, one, batch_size): +def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors): """Hybrid routine to rearrange nms output to move all valid entries to top. @@ -86,6 +88,9 @@ def hybrid_rearrange_indices_out(data, one, batch_size): Batch size. We need to pass it in since hybrid script doesn't support binding variable to symbolic dim. + num_anchors: tvm.tir.IntImm or tvm.tir.Var + Number of anchors. + Returns ------- output : tvm.te.Tensor or numpy NDArray @@ -95,7 +100,6 @@ def hybrid_rearrange_indices_out(data, one, batch_size): Tensor with shape [batch_size, 1], indicates the valid number of boxes. """ - num_anchors = data.shape[1] valid_box_count = output_tensor((batch_size, 1), "int32") output = output_tensor((batch_size, num_anchors), data.dtype) @@ -116,7 +120,8 @@ def hybrid_rearrange_indices_out(data, one, batch_size): @hybrid.script -def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one, batch_size): +def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, + one, batch_size, num_anchors): """Hybrid routine to get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -143,6 +148,9 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one, b Batch size. We need to pass it in since hybrid script doesn't support binding variable to symbolic dim. + num_anchors: tvm.tir.IntImm or tvm.tir.Var + Number of anchors. + Returns ------- valid_count : tvm.te.Tensor or numpy NDArray @@ -154,7 +162,6 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one, b out_indices: tvm.te.Tensor or numpy NDArray Related index in input data. """ - num_anchors = data.shape[1] box_data_length = data.shape[2] valid_count = output_tensor((batch_size,), "int32") out_tensor = output_tensor((batch_size, @@ -215,13 +222,13 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): return hybrid_get_valid_counts(data, score_threshold_const, id_index_const, score_index_const, tvm.tir.const(1, data.dtype), - data.shape[0]) + data.shape[0], data.shape[1]) @hybrid.script -def hybrid_nms(data, sorted_index, valid_count, indices, batch_size, max_output_size, - iou_threshold, force_suppress, top_k, coord_start, score_index, - id_index, return_indices, zero, one): +def hybrid_nms(data, sorted_index, valid_count, indices, batch_size, num_anchors, + max_output_size, iou_threshold, force_suppress, top_k, coord_start, + score_index, id_index, return_indices, zero, one): """Hybrid routing for non-maximum suppression. Parameters @@ -285,7 +292,6 @@ def hybrid_nms(data, sorted_index, valid_count, indices, batch_size, max_output_ 2-D tensor with shape [batch_size, num_anchors]. 
""" - num_anchors = data.shape[1] box_data_length = data.shape[2] # box_indices is the expected value, similar to TF & ONNX @@ -491,6 +497,7 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, valid_count, indices, batch_size, + num_anchors, tvm.tir.const(max_output_size, dtype="int32"), tvm.tir.const(iou_threshold, dtype=data.dtype), tvm.tir.const(force_suppress, dtype="bool"), @@ -503,9 +510,9 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1, one=tvm.tir.const(1, dtype=data.dtype)) if return_indices: return hybrid_rearrange_indices_out(box_indices, one=tvm.tir.const(1, dtype="int32"), - batch_size=batch_size) + batch_size=batch_size, num_anchors=num_anchors) if invalid_to_bottom: out = hybrid_rearrange_box_out(out, one=tvm.tir.const(1, dtype=data.dtype), - batch_size=batch_size) + batch_size=batch_size, num_anchors=num_anchors) return out From 4e22eef8835e88105f12ca70c5f1593bf003aa38 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Wed, 20 May 2020 01:13:51 +0800 Subject: [PATCH 11/22] change attr to Optional --- include/tvm/relay/attrs/transform.h | 6 +- src/relay/op/tensor/transform.cc | 141 ++++++++---------- .../test_pass_combine_parallel_conv2d.py | 48 +++--- topi/include/topi/transform.h | 6 +- topi/python/topi/transform.py | 7 +- topi/src/transform.cc | 2 +- 6 files changed, 96 insertions(+), 114 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index a4c3d0194b22..b63c319a19aa 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -210,9 +210,9 @@ struct SplitAttrs : public tvm::AttrsNode { /*! \brief Attributes for StridedSlice operator */ struct StridedSliceAttrs : public tvm::AttrsNode { - Expr begin; - Expr end; - Expr strides; + Optional> begin; + Optional> end; + Optional> strides; bool ignore_end; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 27c459e76b33..0f54c4d5cb7f 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1750,42 +1750,39 @@ bool StridedSliceRel(const Array& types, // calculate output shape std::vector oshape(num_axis); - const ConstantNode *cbegin, *cend, *cstrides; - if ((cbegin = param->begin.as()) && - (cend = param->end.as()) && - (cstrides = param->strides.as())) { + if (param->begin && param->end && param->strides) { std::vector stride_vec; - int64_t* strides_val = ToVector(cstrides->data); - for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { - stride_vec.push_back(strides_val[i]); + for (Integer i : param->strides.value()) { + CHECK(i.defined()); + stride_vec.push_back(i->value); } for (int64_t i = stride_vec.size(); i < num_axis; ++i) { stride_vec.push_back(1); } const int64_t max_range = std::numeric_limits::max(); std::vector begin_vec; - int64_t* begin_val = ToVector(cbegin->data); - for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { - begin_vec.push_back(begin_val[i]); + for (size_t i = 0; i < param->begin.value().size(); ++i) { + if (!param->begin.value()[i].defined()) { + begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range); + } else { + begin_vec.push_back(param->begin.value()[i]->value); + } } - for (int64_t i = begin_vec.size(); i < num_axis; ++i) { + for (size_t i = begin_vec.size(); i < num_axis; ++i) { begin_vec.push_back(stride_vec[i] > 0 ? 
0 : max_range); } + std::vector end_vec; - int64_t* end_val = ToVector(cend->data); - for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { - if (param->ignore_end) { - end_vec.push_back(max_range); + for (size_t i = 0; i < param->end.value().size(); ++i) { + // allow end to be None + if (param->ignore_end || (!param->end.value()[i].defined())) { + end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } else { - end_vec.push_back(end_val[i]); + end_vec.push_back(param->end.value()[i]->value); } } - for (int64_t i = end_vec.size(); i < num_axis; ++i) { - if (param->ignore_end) { - end_vec.push_back(max_range); - } else { - end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } + for (size_t i = end_vec.size(); i < num_axis; ++i) { + end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } for (int64_t i = 0; i < num_axis; ++i) { @@ -1839,6 +1836,7 @@ bool StridedSliceRel(const Array& types, oshape[i] = Any::make(); } } + reporter->Assign(types[4], TensorType(oshape, data->dtype)); return true; } @@ -1868,22 +1866,19 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, auto *params = const_cast(attrs.as()); CHECK(params != nullptr); Array begin, end, strides; - const ConstantNode *cbegin, *cend, *cstrides; - cbegin = params->begin.as(); - cend = params->end.as(); - cstrides = params->strides.as(); - if (cbegin && cend && cstrides) { - int64_t* strides_val = ToVector(cstrides->data); - for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { - strides.push_back(strides_val[i]); + if (params->begin && params->end && params->strides) { + for (Integer i : params->strides.value()) { + CHECK(i.defined()); + strides.push_back(i->value); } - int64_t* begin_val = ToVector(cbegin->data); - for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { - begin.push_back(begin_val[i]); + + for (Integer i : params->begin.value()) { + CHECK(i.defined()); + begin.push_back(i->value); } - int64_t* end_val = ToVector(cend->data); - for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { - end.push_back(end_val[i]); + for (Integer i : params->end.value()) { + CHECK(i.defined()); + end.push_back(i->value); } } @@ -1920,27 +1915,8 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } layout = new_layout; - - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - auto begin_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, - DataType::Int(64), ctx); - auto end_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, - DataType::Int(64), ctx); - auto strides_ndarray = runtime::NDArray::Empty({int64_t(new_begin.size())}, - DataType::Int(64), ctx); - int64_t* begin_data = static_cast(begin_ndarray->data); - int64_t* end_data = static_cast(end_ndarray->data); - int64_t* strides_data = static_cast(strides_ndarray->data); - for (size_t i = 0; i < new_begin.size(); ++i) { - begin_data[i] = new_begin[i]; - end_data[i] = new_end[i]; - strides_data[i] = 1; - } - params->begin = Constant(begin_ndarray); - params->end = Constant(end_ndarray); - params->strides = Constant(strides_ndarray); + params->begin = new_begin; + params->end = new_end; } return {{layout, Layout("C"), Layout("C"), Layout("C")}, {layout}}; } @@ -1949,6 +1925,7 @@ inline te::Tensor DynamicStridedSlice(const te::Tensor& input, const te::Tensor& begin, const te::Tensor& end, const te::Tensor& strides, + const bool& ignore_end, std::string name = "T_strided_slice_dynamic", std::string tag = topi::kInjective) { int64_t src_tensor_dim = input->shape.size(); @@ -1970,25 +1947,13 @@ Array 
StridedSliceCompute(const Attrs& attrs, const Array(); CHECK(param != nullptr); - const ConstantNode *cbegin, *cend, *cstrides; - if ((cbegin = param->begin.as()) && - (cend = param->end.as()) && - (cstrides = param->strides.as())) { + if (param->begin && param->end && param->strides) { Array begin, end, strides; - int64_t* strides_val = ToVector(cstrides->data); - for (int64_t i = 0; i < cstrides->data.Shape().front(); ++i) { - strides.push_back(strides_val[i]); - } - int64_t* begin_val = ToVector(cbegin->data); - for (int64_t i = 0; i < cbegin->data.Shape().front(); ++i) { - begin.push_back(begin_val[i]); - } - int64_t* end_val = ToVector(cend->data); - for (int64_t i = 0; i < cend->data.Shape().front(); ++i) { - end.push_back(end_val[i]); - } + begin = param->begin.value(); + end = param->end.value(); + strides = param->strides.value(); return Array{ - topi::strided_slice(inputs[0], begin, end, strides) + topi::strided_slice(inputs[0], begin, end, strides, param->ignore_end) }; } else { te::Tensor data = inputs[0]; @@ -2002,7 +1967,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Array{ - DynamicStridedSlice(data, begin, end, strides) + DynamicStridedSlice(data, begin, end, strides, param->ignore_end) }; } } @@ -2014,9 +1979,27 @@ Expr MakeStridedSlice(Expr data, Expr strides, bool ignore_end) { auto attrs = make_object(); - attrs->begin = begin; - attrs->end = end; - attrs->strides = strides; + const ConstantNode *cbegin, *cend, *cstrides; + if ((cbegin = begin.as()) && + (cend = end.as()) && + (cstrides = strides.as())) { + CHECK_EQ(cbegin->data->ndim, 1); + CHECK_EQ(cend->data->ndim, 1); + CHECK_EQ(cstrides->data->ndim, 1); + Array begin, end, strides; + for (int i = 0; i < cbegin->data->shape[0]; i++) { + begin.push_back(Integer(static_cast(ToScalar(cbegin->data, i)))); + } + for (int i = 0; i < cend->data->shape[0]; i++) { + end.push_back(Integer(static_cast(ToScalar(cend->data, i)))); + } + for (int i = 0; i < cstrides->data->shape[0]; i++) { + strides.push_back(Integer(static_cast(ToScalar(cstrides->data, i)))); + } + attrs->begin = begin; + attrs->end = end; + attrs->strides = strides; + } attrs->ignore_end = ignore_end; static const Op& op = Op::Get("strided_slice"); return Call(op, {data, begin, end, strides}, Attrs(attrs), {}); @@ -2315,7 +2298,7 @@ Array SliceLikeCompute(const Attrs& attrs, const Array& } } return Array{topi::strided_slice(inputs[0], GetIntArray(begin_idx), - GetIntArray(end_idx), GetIntArray(strides))}; + GetIntArray(end_idx), GetIntArray(strides), false)}; } TVM_REGISTER_GLOBAL("relay.op._make.slice_like").set_body_typed(MakeSliceLike); diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 291000965be9..c32e9ea39ab3 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -52,17 +52,17 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([1, channels1], "int64"), - strides=relay.const([2, 1], 'int64')) + end=relay.const([channels1, channels1], "int64"), + strides=relay.const([1, 1], 'int64')) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([1, channels1 + channels2], "int64"), - strides=relay.const([2, 1], 'int64')) + end=relay.const([channels2, channels1 + channels2], "int64"), 
+ strides=relay.const([1, 1], 'int64')) y3 = relay.nn.conv2d(x, w3) y4 = relay.strided_slice(y, begin=relay.const([0, channels1 + channels2], "int64"), - end=relay.const([1, channels1 + channels2 + channels4], "int64"), - strides=relay.const([2, 1], 'int64')) + end=relay.const([channels4, channels1 + channels2 + channels4], "int64"), + strides=relay.const([1, 1], 'int64')) y5 = relay.nn.max_pool2d(x) y = relay.Tuple((y1, y2, y3, y4, y5)) return relay.Function(args, y) @@ -80,8 +80,7 @@ def check(x_shape, channels1, channels2, channels3, channels4): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4) y_expected = run_opt_pass(y_expected, transform.InferType()) - np.testing.assert_string_equal(str(y), str(y_expected)),\ - "Actual = \n" + str(y) + "\nExpected = \n" + str(y_expected) + assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4, 4, 4, 4) check((1, 4, 16, 16), 4, 8, 4, 7) @@ -110,12 +109,12 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): y = relay.nn.relu(y) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([1, channels1], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([4, channels1], "int64"), + strides=relay.const([1, 1], "int64")) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([1, channels1 + channels2], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([8, channels1 + channels2], "int64"), + strides=relay.const([1, 1], "int64")) y2 = relay.add(y2, bias) y = relay.Tuple((y1, y2)) return relay.Function(args, y) @@ -133,8 +132,7 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - np.testing.assert_string_equal(str(y), str(y_expected)),\ - "Actual = \n" + str(y) + "Expected = \n" + str(y_expected) + tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4, 8) @@ -156,12 +154,12 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): y = relay.nn.conv2d(x, w, channels=channels1 + channels2) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([1, channels1], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([4, channels1], "int64"), + strides=relay.const([1, 1], "int64")) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([1, channels1 + channels2], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([8, channels1 + channels2], "int64"), + strides=relay.const([1, 1], "int64")) y1 = relay.multiply(y1, scale1) y2 = relay.multiply(y2, scale2) y = relay.Tuple((y1, y2)) @@ -179,8 +177,7 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - np.testing.assert_string_equal(str(y), str(y_expected)),\ - "Actual = \n" + str(y) + "Expected = \n" + str(y_expected) + tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4, 8) @@ -203,12 +200,12 @@ def expected(x, w, channels, repeat): y = relay.nn.conv2d(y, w_concat, channels=channels*2) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([1, 
channels], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([2, channels], "int64"), + strides=relay.const([1, 1], "int64")) y2 = relay.strided_slice(y, begin=relay.const([0, channels], "int64"), - end=relay.const([1, channels * 2], "int64"), - strides=relay.const([2, 1], "int64")) + end=relay.const([2, channels * 2], "int64"), + strides=relay.const([1, 1], "int64")) y = relay.concatenate((y1, y2), axis=1) return relay.Function(args, y) @@ -222,8 +219,7 @@ def check(x_shape, repeat): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w, out_c, repeat) y_expected = run_opt_pass(y_expected, transform.InferType()) - np.testing.assert_string_equal(str(y), str(y_expected)),\ - "Actual = \n" + str(y) + "\nExpected = \n" + str(y_expected) + tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 400cd1edbdd3..ab5d33b81665 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -527,8 +527,8 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \return A Tensor whose op member is the split operation */ inline Tensor strided_slice(const Tensor& x, const Array& begin, const Array& end, - const Array& strides, std::string name = "T_strided_slice", - std::string tag = kInjective) { + const Array& strides, const bool& ignore_end, + std::string name = "T_strided_slice", std::string tag = kInjective) { size_t src_tensor_dim = static_cast(x->shape.size()); // Setup the ranges. // NOTE: this code duplicates the shape inference logic relay.op @@ -559,7 +559,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const std::vector end_vec; for (size_t i = 0; i < end.size(); ++i) { // allow end to be None - if (!end[i].defined()) { + if (ignore_end || (!end[i].defined())) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } else { end_vec.push_back(end[i]->value); diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index ef5456095899..fe7ceb6015db 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -131,7 +131,7 @@ def flip(a, axis=0): """ return cpp.flip(a, axis) -def strided_slice(a, begin, end, strides=None): +def strided_slice(a, begin, end, strides=None, ignore_end=False): """Slice of an array. Parameters @@ -150,13 +150,16 @@ def strided_slice(a, begin, end, strides=None): in that case, the input tensor will be reversed in that particular axis. + ignore_end: boolean, optional + Specifies whether to ignore input end. 
+ Returns ------- ret : tvm.te.Tensor """ if strides is None: strides = [] - return cpp.strided_slice(a, begin, end, strides) + return cpp.strided_slice(a, begin, end, strides, ignore_end) @tvm.te.tag_scope(tag=tag.INJECTIVE+",strided_set") def strided_set(a, v, begin, end, strides=None): diff --git a/topi/src/transform.cc b/topi/src/transform.cc index fa27b995c365..aab20f53fc31 100644 --- a/topi/src/transform.cc +++ b/topi/src/transform.cc @@ -148,7 +148,7 @@ TVM_REGISTER_GLOBAL("topi.tensordot").set_body([](TVMArgs args, TVMRetValue* rv) }); TVM_REGISTER_GLOBAL("topi.strided_slice").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = strided_slice(args[0], args[1], args[2], args[3]); + *rv = strided_slice(args[0], args[1], args[2], args[3], args[4]); }); TVM_REGISTER_GLOBAL("topi.one_hot").set_body([](TVMArgs args, TVMRetValue* rv) { From 3af16825637b0cbb7d97c4f47888644b1de28134 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Wed, 20 May 2020 01:27:21 +0800 Subject: [PATCH 12/22] clang format --- include/tvm/relay/attrs/transform.h | 12 ++- include/tvm/relay/attrs/vision.h | 18 ++-- src/relay/op/tensor/transform.cc | 87 ++++++++----------- src/relay/op/vision/nms.cc | 21 ++--- .../transforms/combine_parallel_conv2d.cc | 26 ++---- topi/include/topi/transform.h | 1 + 6 files changed, 61 insertions(+), 104 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index b63c319a19aa..0485dbf914fb 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -216,13 +216,11 @@ struct StridedSliceAttrs : public tvm::AttrsNode { bool ignore_end; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { - TVM_ATTR_FIELD(begin) - .describe("Indices for begin of slice, begin index is also inclusive"); - TVM_ATTR_FIELD(end) - .describe("Indices for end of slice, end index is exclusive"); - TVM_ATTR_FIELD(strides) - .describe("Stride values of the slice"); - TVM_ATTR_FIELD(ignore_end).set_default(false) + TVM_ATTR_FIELD(begin).describe("Indices for begin of slice, begin index is also inclusive"); + TVM_ATTR_FIELD(end).describe("Indices for end of slice, end index is exclusive"); + TVM_ATTR_FIELD(strides).describe("Stride values of the slice"); + TVM_ATTR_FIELD(ignore_end) + .set_default(false) .describe("Whether to ignore the input end and infer value of end from input data"); } }; diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 52669ea651ee..550e24b8de26 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -101,26 +101,22 @@ struct NonMaximumSuppressionAttrs : public tvm::AttrsNode& types, int num_inputs, const Attrs& raw_attrs, } } -inline te::Tensor DynamicArange(const te::Tensor& start, - const te::Tensor& stop, - const te::Tensor& step, - tvm::DataType dtype, +inline te::Tensor DynamicArange(const te::Tensor& start, const te::Tensor& stop, + const te::Tensor& step, tvm::DataType dtype, std::string name = "T_arange_dynamic", std::string tag = topi::kInjective) { tvm::PrimExpr num_elem = tvm::tir::Var("num_elem"); @@ -1736,9 +1734,7 @@ int64_t* ToVector(const runtime::NDArray& array) { return rel_vec; } -bool StridedSliceRel(const Array& types, - int num_inputs, - const Attrs& attrs, +bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 5); const StridedSliceAttrs* param = attrs.as(); @@ -1768,7 +1764,7 @@ bool StridedSliceRel(const Array& types, 
begin_vec.push_back(param->begin.value()[i]->value); } } - for (size_t i = begin_vec.size(); i < num_axis; ++i) { + for (int64_t i = begin_vec.size(); i < num_axis; ++i) { begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range); } @@ -1781,7 +1777,7 @@ bool StridedSliceRel(const Array& types, end_vec.push_back(param->end.value()[i]->value); } } - for (size_t i = end_vec.size(); i < num_axis; ++i) { + for (int64_t i = end_vec.size(); i < num_axis; ++i) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } @@ -1790,12 +1786,8 @@ bool StridedSliceRel(const Array& types, int64_t begin_v = begin_vec[i]; int64_t end_v = end_vec[i]; - if ((stride_v == 1 && - begin_v == 0 && - end_v == max_range) || - (stride_v == -1 && - begin_v == max_range && - end_v == 0)) { + if ((stride_v == 1 && begin_v == 0 && end_v == max_range) || + (stride_v == -1 && begin_v == max_range && end_v == 0)) { // Quick path, do not slice this dimension. oshape[i] = dshape[i]; continue; @@ -1815,16 +1807,14 @@ bool StridedSliceRel(const Array& types, int64_t slice_range, step; if (stride_v < 0) { if (end_v < -1) end_v = -1; - CHECK_LE(end_v, begin_v) - << "strided_slice get empty slice at axis " << i; + CHECK_LE(end_v, begin_v) << "strided_slice get empty slice at axis " << i; begin_v = std::min(dim_size - 1, begin_v); slice_range = begin_v - end_v; step = -stride_v; } else { if (begin_v < 0) begin_v = 0; CHECK_GE(stride_v, 0); - CHECK_LT(begin_v, end_v) - << "strided_slice get empty slice at axis " << i; + CHECK_LT(begin_v, end_v) << "strided_slice get empty slice at axis " << i; end_v = std::min(dim_size, end_v); slice_range = end_v - begin_v; step = stride_v; @@ -1863,7 +1853,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, auto shape = old_in_shapes[0]; // NOTE: Discard "const" qualifier here. - auto *params = const_cast(attrs.as()); + auto* params = const_cast(attrs.as()); CHECK(params != nullptr); Array begin, end, strides; if (params->begin && params->end && params->strides) { @@ -1903,8 +1893,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } } int64_t bg = begin[i].defined() ? begin[i]->value : 0; - int64_t ed = end[i].defined() ? end[i]->value : - shape[i].as()->value; + int64_t ed = end[i].defined() ? 
end[i]->value : shape[i].as()->value; if (bg % factor || ed % factor) { // transform to original layout return {{Layout::Undef()}, {Layout::Undef()}}; @@ -1921,10 +1910,8 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, return {{layout, Layout("C"), Layout("C"), Layout("C")}, {layout}}; } -inline te::Tensor DynamicStridedSlice(const te::Tensor& input, - const te::Tensor& begin, - const te::Tensor& end, - const te::Tensor& strides, +inline te::Tensor DynamicStridedSlice(const te::Tensor& input, const te::Tensor& begin, + const te::Tensor& end, const te::Tensor& strides, const bool& ignore_end, std::string name = "T_strided_slice_dynamic", std::string tag = topi::kInjective) { @@ -1934,13 +1921,16 @@ inline te::Tensor DynamicStridedSlice(const te::Tensor& input, out_shape.push_back(tvm::tir::Var("dim")); } // TODO(yongwww): move the compute into topi - return te::compute(out_shape, [&](const Array& indices) { - Array real_indices; - for (int32_t i = 0; i < src_tensor_dim; ++i) { - real_indices.push_back(indices[i] * strides(i) + begin(i)); - } - return input(real_indices); - }, name, tag); + return te::compute( + out_shape, + [&](const Array& indices) { + Array real_indices; + for (int32_t i = 0; i < src_tensor_dim; ++i) { + real_indices.push_back(indices[i] * strides(i) + begin(i)); + } + return input(real_indices); + }, + name, tag); } Array StridedSliceCompute(const Attrs& attrs, const Array& inputs, @@ -1953,35 +1943,28 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayend.value(); strides = param->strides.value(); return Array{ - topi::strided_slice(inputs[0], begin, end, strides, param->ignore_end) - }; + topi::strided_slice(inputs[0], begin, end, strides, param->ignore_end)}; } else { te::Tensor data = inputs[0]; te::Tensor begin = inputs[1]; te::Tensor end = inputs[2]; te::Tensor strides = inputs[3]; // Dynamic computation - CHECK(begin->shape[0].as()->value == data->shape.size() - && end->shape[0].as()->value == data->shape.size() - && strides->shape[0].as()->value == data->shape.size()) - << "begin, end, and strides are required to have the same length" - << " if they are non-constant."; - return Array{ - DynamicStridedSlice(data, begin, end, strides, param->ignore_end) - }; + int64_t attr_size = data->shape.size(); + CHECK(begin->shape[0].as()->value == attr_size && + end->shape[0].as()->value == attr_size && + strides->shape[0].as()->value == attr_size) + << "begin, end, and strides are required to have the same length" + << " if they are non-constant."; + return Array{DynamicStridedSlice(data, begin, end, strides, param->ignore_end)}; } } // Positional relay function to create StridedSlice operator used by frontend FFI. -Expr MakeStridedSlice(Expr data, - Expr begin, - Expr end, - Expr strides, - bool ignore_end) { +Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool ignore_end) { auto attrs = make_object(); const ConstantNode *cbegin, *cend, *cstrides; - if ((cbegin = begin.as()) && - (cend = end.as()) && + if ((cbegin = begin.as()) && (cend = end.as()) && (cstrides = strides.as())) { CHECK_EQ(cbegin->data->ndim, 1); CHECK_EQ(cend->data->ndim, 1); @@ -2005,9 +1988,7 @@ Expr MakeStridedSlice(Expr data, return Call(op, {data, begin, end, strides}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op._make.strided_slice") -.set_body_typed(MakeStridedSlice); - +TVM_REGISTER_GLOBAL("relay.op._make.strided_slice").set_body_typed(MakeStridedSlice); RELAY_REGISTER_OP("strided_slice") .describe(R"code(Strided slice of an array. 
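[Reviewer sketch, not part of the patch] The dynamic path above leaves every output extent symbolic and gathers each element from input(indices[i] * strides(i) + begin(i)); the matching shape function then fixes the extents to ceil((end - begin) / stride) per axis. A self-contained NumPy model of that rule for positive strides (dynamic_strided_slice is an illustrative stand-in, not TVM API):

    import numpy as np

    def dynamic_strided_slice(data, begin, end, strides):
        # Extent per axis is ceil((end - begin) / stride); each output element
        # is gathered from index i * stride + begin along every axis.
        out_shape = [-((e - b) // -s) for b, e, s in zip(begin, end, strides)]
        out = np.empty(out_shape, dtype=data.dtype)
        for idx in np.ndindex(*out_shape):
            out[idx] = data[tuple(i * s + b for i, b, s in zip(idx, begin, strides))]
        return out

    x = np.arange(12).reshape(3, 4)
    np.testing.assert_array_equal(dynamic_strided_slice(x, [0, 1], [3, 4], [2, 2]),
                                  x[0:3:2, 1:4:2])

Reading one element each of begin, end, and strides per axis is also why StridedSliceCompute checks that all three have length equal to the data rank when they are non-constant.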
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index e51432931605..7486db790780 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -97,19 +97,9 @@ bool NMSRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } - -Expr MakeNMS(Expr data, - Expr valid_count, - Expr indices, - int max_output_size, - double iou_threshold, - bool force_suppress, - int top_k, - int coord_start, - int score_index, - int id_index, - bool return_indices, - bool invalid_to_bottom) { +Expr MakeNMS(Expr data, Expr valid_count, Expr indices, int max_output_size, double iou_threshold, + bool force_suppress, int top_k, int coord_start, int score_index, int id_index, + bool return_indices, bool invalid_to_bottom) { auto attrs = make_object(); attrs->max_output_size = max_output_size; attrs->iou_threshold = iou_threshold; @@ -124,11 +114,10 @@ Expr MakeNMS(Expr data, return Call(op, {data, valid_count, indices}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.vision._make.non_max_suppression") -.set_body_typed(MakeNMS); +TVM_REGISTER_GLOBAL("relay.op.vision._make.non_max_suppression").set_body_typed(MakeNMS); RELAY_REGISTER_OP("vision.non_max_suppression") -.describe(R"doc(Non-maximum suppression. The input boxes should + .describe(R"doc(Non-maximum suppression. The input boxes should be in the format of [class_id, score, left, top, right, bottom] or [score, left, top, right, bottom]. Set id_index to be -1 to ignore class_id axis. diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index d17e2fcd9655..fa8677372d35 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -71,15 +71,12 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { const auto shape_b = tir::BijectiveLayout(Layout(attrs_b->kernel_layout), kOIHW).ForwardShape(tweight_b->shape); - return eq(attrs_a->strides, attrs_b->strides) && - eq(attrs_a->padding, attrs_b->padding) && - eq(attrs_a->dilation, attrs_b->dilation) && - eq(attrs_a->groups, attrs_b->groups) && + return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) && + eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) && eq(attrs_a->data_layout, attrs_b->data_layout) && eq(attrs_a->kernel_layout, attrs_b->kernel_layout) && eq(attrs_a->out_dtype, attrs_b->out_dtype) && - eq(attrs_a->out_layout, attrs_b->out_layout) && - eq(shape_a[2], shape_b[2]) && + eq(attrs_a->out_layout, attrs_b->out_layout) && eq(shape_a[2], shape_b[2]) && eq(shape_a[3], shape_b[3]); } @@ -183,12 +180,10 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; - auto begin_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, - DataType::Int(64), ctx); - auto end_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, - DataType::Int(64), ctx); - auto strides_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, - DataType::Int(64), ctx); + auto begin_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); + auto end_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); + auto strides_ndarray = + runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); auto* begin_data = static_cast(begin_ndarray->data); auto* end_data = static_cast(end_ndarray->data); @@ -200,11 +195,8 @@ class ParallelConv2DCombiner : public 
ParallelOpCombiner { strides_data[i] = 1; } - auto slice = MakeStridedSlice(data, - Constant(begin_ndarray), - Constant(end_ndarray), - Constant(strides_ndarray), - false); + auto slice = MakeStridedSlice(data, Constant(begin_ndarray), Constant(end_ndarray), + Constant(strides_ndarray), false); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index ab5d33b81665..2b26f75d09e3 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -520,6 +520,7 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \param begin The indices to begin with in the slicing * \param end Indicies indicating end of the slice * \param strides Specifies the stride values, it can be negative + * \param ignore_end Specifies whether to ignore input end * in that case, the input tensor will be reversed in that particular axis * \param name The name of the operation * \param tag The tag to mark the operation From 5c5bc777fa330a716f5c28c50ee5143e32e2a469 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Wed, 20 May 2020 03:57:38 +0800 Subject: [PATCH 13/22] remove empty lines --- python/tvm/relay/frontend/tensorflow.py | 2 +- .../tvm/relay/frontend/tensorflow_parser.py | 4 +-- python/tvm/relay/op/vision/_vision.py | 2 -- python/tvm/relay/testing/tf.py | 2 +- .../frontend/tensorflow/test_control_flow.py | 26 +++++++++---------- .../frontend/tensorflow/test_debugging.py | 5 ---- .../frontend/tensorflow/test_forward.py | 2 +- tests/python/frontend/tflite/test_forward.py | 2 +- tests/python/relay/test_any.py | 1 - tests/python/relay/test_op_level4.py | 2 -- .../test_pass_combine_parallel_conv2d.py | 7 +++-- tutorials/frontend/from_tensorflow.py | 4 +-- 12 files changed, 24 insertions(+), 35 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index a830018c8a9c..34fc4893773e 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -2530,7 +2530,7 @@ class LoopBound(ExprVisitor): .. code-block:: python i = tf.constant(0) - data = tf.placeholder(tf.float32, shape=(1024, 1024)) + data = tf.compat.v1.placeholder(tf.float32, shape=(1024, 1024)) slice = tf.strided_slice(data, 0, 512) def c(i): return tf.less(i, 10) def b(i): return [tf.add(i, 1), tf.add(i, 1) + slice] diff --git a/python/tvm/relay/frontend/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py index 4e0f14c577cb..fdbb8768597f 100644 --- a/python/tvm/relay/frontend/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -80,14 +80,14 @@ def _get_output_names(self): "required to restore from saved model.") tags = self._get_tag_set() output_names = set() - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: meta_graph_def = tf.saved_model.loader.load(sess, tags, self._model_dir) for sig_def in meta_graph_def.signature_def.values(): for output_tensor in sig_def.outputs.values(): output_names.add(output_tensor.name.replace(":0", "")) - tf.compat.v1.reset_default_graph() + tf.reset_default_graph() return ",".join(output_names) def _load_saved_model(self): diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 094671c74284..f6c4f811f13d 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -19,9 +19,7 @@ from __future__ import absolute_import import topi - from tvm.te.hybrid import script - from .. import op as reg from .. 
import strategy from ..op import OpPattern diff --git a/python/tvm/relay/testing/tf.py b/python/tvm/relay/testing/tf.py index 567724d9d251..dc7937c0b346 100644 --- a/python/tvm/relay/testing/tf.py +++ b/python/tvm/relay/testing/tf.py @@ -77,7 +77,7 @@ def AddShapesToGraphDef(session, out_node): Parameters ---------- - session : tf.compat.v1.Session + session : tf.Session Tensorflow session out_node : String or List Final output node of the graph. diff --git a/tests/python/frontend/tensorflow/test_control_flow.py b/tests/python/frontend/tensorflow/test_control_flow.py index 95d5b797430c..9777a8dc4462 100644 --- a/tests/python/frontend/tensorflow/test_control_flow.py +++ b/tests/python/frontend/tensorflow/test_control_flow.py @@ -53,7 +53,7 @@ def b(i): return tf.add(i, 1) r = tf.while_loop(c, b, [i]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -70,7 +70,7 @@ def b(i): return tf.add(i, 1) r = tf.while_loop(c, b, [i]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -89,7 +89,7 @@ def b(i, j): return [tf.add(i, 1), j] i1, i2 = tf.while_loop(c, b, loop_vars=[i0, j0]) i1 += tf.constant(1337) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(i1) check_equal(graph, tf_out) @@ -107,7 +107,7 @@ def c(i, j, k): return i < 10 def b(i, j, k): return [i+1, j * k, k + i] r = tf.while_loop(c, b, loop_vars=[i0, j0, k0]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -127,7 +127,7 @@ def c(i, j, k): return \ def b(i, j, k): return [i+j, j+k, k+1] r = tf.while_loop(c, b, loop_vars=[i, j, k]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -147,7 +147,7 @@ def condition(x): return tf.reduce_sum(x) < 100 x = tf.constant(0, shape=[2, 2]) r = tf.while_loop(condition, body, [x]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -170,7 +170,7 @@ def condition(x): x = tf.constant(3) r = tf.while_loop(condition, body, loop_vars=[x]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -189,7 +189,7 @@ def f2(): return tf.add(4, 23) r = tf.cond(tf.less(i, j), f1, f2) - with tf.compat.v1.Session(graph=graph) as sess: + with tf.Session(graph=graph) as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -204,7 +204,7 @@ def test_multiple_cond_vars(): r = tf.cond(tf.less(tf.add(x1, x2), 10), lambda: tf.add(10, 2), lambda: tf.square(5)) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) @@ -224,7 +224,7 @@ def fn2(x, y): k = tf.constant(3) r = tf.cond(tf.less(i, j), lambda: fn1(i, k), lambda: fn2(j, k)) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r, feed_dict={i: 1, j: 2, k: 3}) check_equal(graph, tf_out) @@ -252,7 +252,7 @@ def fn2(a, b): pred = tf.less(x, y) r = tf.cond(pred, lambda: fn1(x, y), lambda: fn2(y, z)) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r, feed_dict={x: 1, y: 2, z: 3, pred: True}) check_equal(graph, tf_out) @@ -279,7 +279,7 @@ def fn2(a, b): pred = tf.less(x, y) r = tf.cond(pred, lambda: fn1(x, y), lambda: fn2(y, z)) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = 
sess.run(r, feed_dict={x: 1, y: 2, z: 3, pred: True}) check_equal(graph, tf_out) @@ -300,7 +300,7 @@ def condition(x): return tf.less(x, 100) r = tf.while_loop(condition, body, loop_vars=[x]) - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: tf_out = sess.run(r) check_equal(graph, tf_out) diff --git a/tests/python/frontend/tensorflow/test_debugging.py b/tests/python/frontend/tensorflow/test_debugging.py index 8dac612b4879..a6df6ffb63a1 100644 --- a/tests/python/frontend/tensorflow/test_debugging.py +++ b/tests/python/frontend/tensorflow/test_debugging.py @@ -17,7 +17,6 @@ """Unit tests for converting TensorFlow debugging ops to Relay.""" try: import tensorflow.compat.v1 as tf - tf.disable_v2_behavior() except ImportError: import tensorflow as tf @@ -25,7 +24,6 @@ from tvm import relay from tvm.relay.frontend.tensorflow import from_tensorflow - def run_relay(graph, shape_dict=None, *vars): mod, params = from_tensorflow( graph.as_graph_def(add_shapes=True), @@ -33,7 +31,6 @@ def run_relay(graph, shape_dict=None, *vars): ex = relay.create_executor('debug', mod=mod) return ex.evaluate()(*vars) - def test_assert_true(): g = tf.Graph() shape = (1, 2) @@ -77,7 +74,6 @@ def test_assert_true_var_capture(): np.testing.assert_allclose(True, run_relay(g, None, x_value).asnumpy()) - def test_assert_false(): g = tf.Graph() with g.as_default(): @@ -96,7 +92,6 @@ def test_assert_false(): # argument is false. np.testing.assert_allclose(0, run_relay(g).asnumpy()) - if __name__ == "__main__": test_assert_true() test_assert_true_var_capture() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 76d2fe13aa49..b0409d7d0eac 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -3347,4 +3347,4 @@ def test_forward_isfinite(): test_read_variable_op() # Sharing params case using Mean ops - test_sharing_node() \ No newline at end of file + test_sharing_node() diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index eca5fb7d4b74..7a8437aaedd3 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -169,7 +169,7 @@ def compare_tflite_with_tvm(in_data, in_name, input_tensors, for i in range(len(in_name)): in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] - with tf.compat.v1.Session() as sess: + with tf.Session() as sess: if init_global_variables: sess.run(variables.global_variables_initializer()) # convert to tflite model diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index e5a7521ce721..168db3768ea0 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -672,7 +672,6 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), ignore_end=True) - def test_recursive_concat(): """ fn @concat_loop(%i: int32, %st: (any, 1)) -> (any, 1) { diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 4b5e19223dca..36dc91c53030 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -323,7 +323,6 @@ def verify(dshape, begin, end, strides, output, assert "begin=" in text assert "end=" in text - if output: assert func.body.checked_type == relay.ty.TensorType(output, "float32") @@ -400,4 +399,3 @@ def 
verify(dshape, begin, end, strides, vshape, test_ref=True): test_where() test_reduce_functions() test_mean_var_std() - diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index c32e9ea39ab3..4db643e2b6f8 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -17,7 +17,6 @@ import tvm from tvm import relay from tvm.relay import transform -import numpy as np def run_combine_parallel(expr, min_num_branches=3): @@ -132,7 +131,7 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4, 8) @@ -177,7 +176,7 @@ def check(x_shape, channels1, channels2): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2) y_expected = run_opt_pass(y_expected, transform.InferType()) - tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4, 8) @@ -219,7 +218,7 @@ def check(x_shape, repeat): transform.CombineParallelConv2D(min_num_branches=2)) y_expected = expected(x, w, out_c, repeat) y_expected = run_opt_pass(y_expected, transform.InferType()) - tvm.ir.structural_equal(y, y_expected, map_free_vars=True) + assert tvm.ir.structural_equal(y, y_expected, map_free_vars=True) check((1, 4, 16, 16), 4) diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 1b97c442dbca..0ebd733ef9aa 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -101,7 +101,7 @@ # Call the utility to import the graph definition into default graph. graph_def = tf_testing.ProcessGraphDefParam(graph_def) # Add shapes to the graph. - with tf.compat.v1.Session() as sess: + with tf_compat_v1.Session() as sess: graph_def = tf_testing.AddShapesToGraphDef(sess, 'softmax') ###################################################################### @@ -218,7 +218,7 @@ def run_inference_on_image(image): # Creates graph from saved GraphDef. 
create_graph() - with tf.compat.v1.Session() as sess: + with tf_compat_v1.Session() as sess: softmax_tensor = sess.graph.get_tensor_by_name('softmax:0') predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': image_data}) From cbdca347899137d994853668e4da6b8df503f3c4 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Wed, 27 May 2020 01:03:29 +0800 Subject: [PATCH 14/22] partial ignore for end of strided_slice --- python/tvm/relay/_parser.py | 2 +- python/tvm/relay/op/_transform.py | 37 +++++++++- src/relay/analysis/util.cc | 7 ++ src/relay/op/tensor/transform.cc | 69 ++----------------- .../transforms/combine_parallel_conv2d.cc | 4 +- tests/python/relay/test_any.py | 43 ++++++++---- tests/python/relay/test_op_level4.py | 43 +++++++----- .../test_pass_combine_parallel_conv2d.py | 45 +++++++----- topi/include/topi/transform.h | 2 +- .../topi/testing/strided_slice_python.py | 13 ++-- 10 files changed, 141 insertions(+), 124 deletions(-) diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 49f2d4d51321..9de1a1fcb874 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -114,7 +114,7 @@ def convert(self, v): def __call__(self, args, attrs, type_args): if attrs is None: attrs = {} - if self.operator is op.reshape: + if self.operator in (op.reshape, op.strided_slice): x = self.operator(*args) elif self.operator in (op.zeros, op.ones, op.full, op.broadcast_to): x = self.operator(*args, dtype=attrs["dtype"]) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 40fa1caf8943..4d665fab5eb1 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -99,10 +99,14 @@ def _arange_shape_func(start, stop, step): @_reg.register_shape_func("arange", True) def arange_shape_func(attrs, inputs, _): + """ + Shape func for arange + """ return [_arange_shape_func(*inputs)] @script -def _strided_slice_shape_func(data, begin, end, strides, ignore_end): +def _strided_slice_shape_func_input_data(data, begin, end, strides, + ignore_end): ndim = len(data.shape) out = output_tensor((ndim,), "int64") for i in const_range(ndim): @@ -119,10 +123,37 @@ def _strided_slice_shape_func(data, begin, end, strides, ignore_end): out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) return out +@script +def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, ignore_end): + ndim = data_shape.shape[0] + assert ndim == 2, "only rank-2 data is supported by this shape func for now" + out = output_tensor((ndim,), "int64") + for i in const_range(ndim): + cbegin = int64(0) + cend = int64(data_shape[i]) + cstride = int64(1) + if len(begin) > i: + cbegin = int64(begin[i]) + if len(end) > i: + cend = int64(end[i]) + if ignore_end != 0 and len(strides) > i: + cstride = int64(strides[i]) + assert cstride != 0, "Strides can't be zero."
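+ # Each output extent is ceil((cend - cbegin) / cstride); e.g. with
+ # begin=0, end=5 and stride=2 the slice covers indices 0, 2, 4,
+ # so ceil_div(5, 2) = 3 elements.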
+ out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) + return out + + @_reg.register_shape_func("strided_slice", True) def strided_slice_shape_func(attrs, inputs, _): - ignore_end = attrs.ignore_end - return [_strided_slice_shape_func(*inputs, convert(get_const_int(ignore_end)))] + """ + Shape func for strided_slice + """ + ignore_end = convert(get_const_int(attrs.ignore_end)) + # data independent if begin, end and strides exist + if attrs.begin and attrs.end and attrs.strides: + return [_strided_slice_shape_func_input_shape(inputs[0], attrs.begin, attrs.end, + attrs.strides, ignore_end)] + return [_strided_slice_shape_func_input_data(*inputs, ignore_end)] @script def _concatenate_shape_func(inputs, axis): diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index 2853165df4ca..1d9f6a1d7181 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -458,6 +458,13 @@ bool IsDataDependant(const CallNode* call) { return false; } } + } else if (op->name == "strided_slice") { + if (const auto* attrs = call->attrs.as()) { + if (attrs->begin && attrs->end && attrs->strides) { + // not data dependant if begin, end and strides exist + return false; + } + } } return tshape_data_dependant[op]; diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 3597e0235eb5..042541b12b54 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1674,66 +1674,6 @@ Array GetIntArray(Array arr) { // strided_slice TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); -int64_t* ToVector(const runtime::NDArray& array) { - size_t len = array.Shape().front(); - int64_t* rel_vec = new int64_t[len]; - if (array->dtype.code == kDLInt) { - if (array->dtype.bits == 8) { - int8_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 16) { - int16_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 32) { - int32_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 64) { - int64_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } - } else if (array->dtype.code == kDLUInt) { - if (array->dtype.bits == 8) { - uint8_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 16) { - uint16_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 32) { - uint32_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } else if (array->dtype.bits == 64) { - uint64_t* init_array = reinterpret_cast(array->data); - for (size_t i = 0; i < len; ++i) { - rel_vec[i] = int64_t(init_array[i]); - } - return rel_vec; - } - } - LOG(FATAL) << "Unknown data type: " << tvm::runtime::DLDataType2String(array->dtype); - return rel_vec; -} - bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { 
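// `types` holds [data, begin, end, strides, result]; begin, end and
// strides are relay inputs rather than static attributes after this
// change, hence the five entries.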
CHECK_EQ(types.size(), 5); @@ -1771,7 +1711,8 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr std::vector end_vec; for (size_t i = 0; i < param->end.value().size(); ++i) { // allow end to be None - if (param->ignore_end || (!param->end.value()[i].defined())) { + if (!param->end.value()[i].defined() || + (param->ignore_end && param->end.value()[i]->value < 0)) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } else { end_vec.push_back(param->end.value()[i]->value); @@ -1894,6 +1835,9 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } int64_t bg = begin[i].defined() ? begin[i]->value : 0; int64_t ed = end[i].defined() ? end[i]->value : shape[i].as()->value; + if (params->ignore_end && end[i].defined() && end[i]->value < 0) { + ed = shape[i].as()->value; + } if (bg % factor || ed % factor) { // transform to original layout return {{Layout::Undef()}, {Layout::Undef()}}; @@ -1912,7 +1856,6 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, inline te::Tensor DynamicStridedSlice(const te::Tensor& input, const te::Tensor& begin, const te::Tensor& end, const te::Tensor& strides, - const bool& ignore_end, std::string name = "T_strided_slice_dynamic", std::string tag = topi::kInjective) { int64_t src_tensor_dim = input->shape.size(); @@ -1956,7 +1899,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayshape[0].as()->value == attr_size) << "begin, end, and strides are required to have the same length" << " if they are non-constant."; - return Array{DynamicStridedSlice(data, begin, end, strides, param->ignore_end)}; + return Array{DynamicStridedSlice(data, begin, end, strides)}; } } diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index fa8677372d35..04ed35b709e5 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -172,7 +172,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { Array end; for (size_t i = 0; i < channel_pos_; i++) { begin.push_back(0); - end.push_back(channels); + end.push_back(-1); } begin.push_back(index); index += channels; @@ -196,7 +196,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { } auto slice = MakeStridedSlice(data, Constant(begin_ndarray), Constant(end_ndarray), - Constant(strides_ndarray), false); + Constant(strides_ndarray), true); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 168db3768ea0..83c6244a7d72 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -643,27 +643,39 @@ def test_arange_with_dynamic_shape(): result = ex.evaluate()(data) tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) -def verify_any_strided_slice(data_shape, begin_shape, end_shape, - strides_shape, data_np_shape, ignore_end=False): +def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, + data_np_shape, ignore_end=False, const_attrs=False, dtype="int32"): + # Generate random numpy input data + np_data = np.random.uniform(size=data_np_shape).astype('float32') + np_begin = np.random.randint(2, size=begin_shape, dtype=dtype) + np_end = np.random.randint(5, 15, size=end_shape, dtype=dtype) + np_strides = np.random.randint(1, 3, size=strides_shape, dtype=dtype) + # target numpy result + ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, ignore_end) + + # Relay Module mod = 
tvm.IRModule() data = relay.var('data', shape=data_shape, dtype='float32') - begin = relay.var('begin', shape=begin_shape, dtype="int32") - end = relay.var('end', shape=end_shape, dtype="int32") - strides = relay.var('strides', shape=strides_shape, dtype="int32") - y = relay.strided_slice(data, begin, end, strides, ignore_end) - mod["main"] = relay.Function([data, begin, end, strides], y) - - # Generate random numpy input data - data_np = np.random.uniform(size=data_np_shape).astype('float32') - begin_np = np.random.randint(2, size=begin_shape, dtype="int32") - end_np = np.random.randint(5, 15, size=end_shape, dtype="int32") - strides_np = np.random.randint(1, 3, size=strides_shape, dtype="int32") + if const_attrs: + begin = relay.const(np_begin, dtype) + end = relay.const(np_end, dtype) + strides = relay.const(np_strides, dtype) + args = [data] + np_inputs = [np_data] + else: + begin = relay.var('begin', shape=begin_shape, dtype=dtype) + end = relay.var('end', shape=end_shape, dtype=dtype) + strides = relay.var('strides', shape=strides_shape, dtype=dtype) + args = [data, begin, end, strides] + np_inputs = [np_data, np_begin, np_end, np_strides] - ref_res = topi.testing.strided_slice_python(data_np, begin_np, end_np, strides_np) + y = relay.strided_slice(data, begin=begin, end=end, + strides=strides, ignore_end=ignore_end) + mod["main"] = relay.Function(args, y) for kind in ["debug", "vm"]: ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") - result = ex.evaluate()(data_np, begin_np, end_np, strides_np) + result = ex.evaluate()(*np_inputs) tvm.testing.assert_allclose(result.asnumpy(), ref_res) def test_any_strided_slice(): @@ -671,6 +683,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), ignore_end=True) + verify_any_strided_slice(any_dims(2), (2,), (2,), (2,), (6, 7)) def test_recursive_concat(): """ diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 36dc91c53030..081236badd02 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -296,25 +296,34 @@ def test_mean_var_std(): def test_strided_slice(): - def verify(dshape, begin, end, strides, output, - ignore_end=False, test_ref=True, dtype="int32"): + def verify(dshape, begin, end, strides, output, ignore_end=False, + attr_const=True, test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) begin = begin if begin else [0] * ndim end = end if end else list(dshape) - begin_expr = relay.const(begin, dtype=dtype) - end_expr = relay.const(end, dtype=dtype) + + # target numpy result + x_data = np.random.uniform(size=dshape).astype("float32") + ref_res = topi.testing.strided_slice_python( + x_data, begin, end, strides, ignore_end) + + if attr_const: + begin = relay.const(begin, dtype=dtype) + end = relay.const(end, dtype=dtype) + if strides: - strides_expr = relay.const(strides, dtype=dtype) + if attr_const: + strides = relay.const(strides, dtype=dtype) z = relay.strided_slice(x, - begin=begin_expr, - end=end_expr, - strides=strides_expr, + begin=begin, + end=end, + strides=strides, ignore_end=ignore_end) else: z = relay.strided_slice(x, - begin=begin_expr, - end=end_expr, + begin=begin, + end=end, ignore_end=ignore_end) func = relay.Function([x], z) @@ -328,26 +337,26 @@ def verify(dshape, 
begin, end, strides, output, if not test_ref: return - x_data = np.random.uniform(size=dshape).astype("float32") - ref_res = topi.testing.strided_slice_python( - x_data, begin, end, strides) for target, ctx in ctx_list(): intrp = relay.create_executor("graph", ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], - (1, 120, 120, 3), dtype="int64") + verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], + [1, 1, 1, 1], (1, 120, 120, 3), dtype="int64") verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16") verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) - verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (2, 4, 2), - ignore_end=True, test_ref=False) + verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2), attr_const=False) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3)) verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) + verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 2], + (2, 4, 2), ignore_end=True, test_ref=False) + verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 2], + (2, 2, 2), ignore_end=True, test_ref=True) def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 4db643e2b6f8..28c9655808f5 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -51,17 +51,20 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([channels1, channels1], "int64"), - strides=relay.const([1, 1], 'int64')) + end=relay.const([-1, channels1], "int64"), + strides=relay.const([1, 1], 'int64'), + ignore_end=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([channels2, channels1 + channels2], "int64"), - strides=relay.const([1, 1], 'int64')) + end=relay.const([-1, channels1 + channels2], "int64"), + strides=relay.const([1, 1], 'int64'), + ignore_end=True) y3 = relay.nn.conv2d(x, w3) y4 = relay.strided_slice(y, begin=relay.const([0, channels1 + channels2], "int64"), - end=relay.const([channels4, channels1 + channels2 + channels4], "int64"), - strides=relay.const([1, 1], 'int64')) + end=relay.const([-1, channels1 + channels2 + channels4], "int64"), + strides=relay.const([1, 1], 'int64'), + ignore_end=True) y5 = relay.nn.max_pool2d(x) y = relay.Tuple((y1, y2, y3, y4, y5)) return relay.Function(args, y) @@ -108,12 +111,14 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): y = relay.nn.relu(y) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([4, channels1], "int64"), - strides=relay.const([1, 1], "int64")) + end=relay.const([-1, channels1], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([8, channels1 + channels2], "int64"), - strides=relay.const([1, 1], 
"int64")) + end=relay.const([-1, channels1 + channels2], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y2 = relay.add(y2, bias) y = relay.Tuple((y1, y2)) return relay.Function(args, y) @@ -153,12 +158,14 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): y = relay.nn.conv2d(x, w, channels=channels1 + channels2) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([4, channels1], "int64"), - strides=relay.const([1, 1], "int64")) + end=relay.const([-1, channels1], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([8, channels1 + channels2], "int64"), - strides=relay.const([1, 1], "int64")) + end=relay.const([-1, channels1 + channels2], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y1 = relay.multiply(y1, scale1) y2 = relay.multiply(y2, scale2) y = relay.Tuple((y1, y2)) @@ -199,12 +206,14 @@ def expected(x, w, channels, repeat): y = relay.nn.conv2d(y, w_concat, channels=channels*2) y1 = relay.strided_slice(y, begin=relay.const([0, 0], "int64"), - end=relay.const([2, channels], "int64"), - strides=relay.const([1, 1], "int64")) + end=relay.const([-1, channels], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels], "int64"), - end=relay.const([2, channels * 2], "int64"), - strides=relay.const([1, 1], "int64")) + end=relay.const([-1, channels * 2], "int64"), + strides=relay.const([1, 1], "int64"), + ignore_end=True) y = relay.concatenate((y1, y2), axis=1) return relay.Function(args, y) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 2b26f75d09e3..9d268a9afc74 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -560,7 +560,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const std::vector end_vec; for (size_t i = 0; i < end.size(); ++i) { // allow end to be None - if (ignore_end || (!end[i].defined())) { + if (!end[i].defined() || (ignore_end && end[i]->value < 0)) { end_vec.push_back(stride_vec[i] < 0 ? 
0 : max_range); } else { end_vec.push_back(end[i]->value); diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py index b21c3fb87119..62aface75057 100644 --- a/topi/python/topi/testing/strided_slice_python.py +++ b/topi/python/topi/testing/strided_slice_python.py @@ -45,10 +45,15 @@ def strided_slice_python(data, begin, end, strides, ignore_end=False): strides = [] if strides is None else strides slices = [] for i in range(len(data.shape)): - slices.append(slice( - begin[i] if i < len(begin) else None, - end[i] if i < len(end) and not ignore_end else None, - strides[i] if i < len(strides) else None)) + bg = begin[i] if i < len(begin) else None + if i >= len(end) or (ignore_end and end[i] < 0): + ed = None + else: + ed = end[i] + sd = strides[i] if i < len(strides) else None + slices.append(slice(bg, + ed, + sd)) return data[tuple(slices)] From eb10cbe86ea2bf1bf0e4e9f8a8fa5b04fd1de016 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Wed, 27 May 2020 01:36:35 +0800 Subject: [PATCH 15/22] pylint --- include/tvm/relay/attrs/transform.h | 2 +- python/tvm/relay/op/transform.py | 3 ++- src/relay/op/tensor/transform.cc | 2 +- tests/python/relay/test_any.py | 6 ++++-- topi/include/topi/transform.h | 2 +- topi/python/topi/testing/strided_slice_python.py | 16 ++++++++-------- topi/python/topi/transform.py | 2 +- 7 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 0485dbf914fb..c76b867f3b13 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -221,7 +221,7 @@ struct StridedSliceAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(strides).describe("Stride values of the slice"); TVM_ATTR_FIELD(ignore_end) .set_default(false) - .describe("Whether to ignore the input end and infer value of end from input data"); + .describe("Whether to ignore the negative elements in input end."); } }; diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 6033aae3e960..8e10e7c7775b 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -630,7 +630,8 @@ def strided_slice(data, begin, end, strides=None, ignore_end=False): the input tensor will be reversed in that particular axis. ignore_end: boolean, optional - Whether to ignore input end. + Whether to ignore the negative elements in input end, + will slice to the end of data for the ignored element. 
Returns ------- diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 042541b12b54..fe320a5e0d2b 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1962,7 +1962,7 @@ Examples:: .add_argument("begin", "Tensor", "The indices to begin with in the slicing.") .add_argument("end", "Tensor", "Indices indicating end of the slice.") .add_argument("strides", "Tensor", "The stride values.") - .add_argument("ignore_end", "Tensor", "Whether to ignore end.") + .add_argument("ignore_end", "Tensor", "Whether to ignore negative elements of input end.") .set_support_level(4) .set_attrs_type() .add_type_rel("StridedSlice", StridedSliceRel) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 83c6244a7d72..4c131715d532 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -678,12 +678,14 @@ def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, result = ex.evaluate()(*np_inputs) tvm.testing.assert_allclose(result.asnumpy(), ref_res) + def test_any_strided_slice(): + verify_any_strided_slice(any_dims(2), (2,), (2,), (2,), (15, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), ignore_end=True) - verify_any_strided_slice(any_dims(2), (2,), (2,), (2,), (6, 7)) + def test_recursive_concat(): """ @@ -810,7 +812,7 @@ def test_mixed_input_type(): ex = relay.create_executor(kind, mod=mod, ctx=tvm.cpu(), target="llvm") result = ex.evaluate()([[data_np0, data_np0], data_np0], data_np1) assert result.asnumpy().shape == ref_out_shape, \ - "Shape mismatch: expect %s but got %s." % (str(ref_out_shape), str(ret.asnumpy().shape)) + "Shape mismatch: expect %s but got %s." % (str(ref_out_shape), str(result.asnumpy().shape)) if __name__ == "__main__": test_any_full() diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 9d268a9afc74..8201a50335ef 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -520,7 +520,7 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \param begin The indices to begin with in the slicing * \param end Indicies indicating end of the slice * \param strides Specifies the stride values, it can be negative - * \param ignore_end Specifies whether to ignore input end + * \param ignore_end Specifies whether to ignore negative elements of input end * in that case, the input tensor will be reversed in that particular axis * \param name The name of the operation * \param tag The tag to mark the operation diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py index 62aface75057..c60f05fef66d 100644 --- a/topi/python/topi/testing/strided_slice_python.py +++ b/topi/python/topi/testing/strided_slice_python.py @@ -35,7 +35,7 @@ def strided_slice_python(data, begin, end, strides, ignore_end=False): The stride of each slice. 
ignore_end : boolean - Whether to ignore input end + Whether to ignore negative elements of input end Returns ------- @@ -45,15 +45,15 @@ def strided_slice_python(data, begin, end, strides, ignore_end=False): strides = [] if strides is None else strides slices = [] for i in range(len(data.shape)): - bg = begin[i] if i < len(begin) else None + new_begin = begin[i] if i < len(begin) else None if i >= len(end) or (ignore_end and end[i] < 0): - ed = None + new_end = None else: - ed = end[i] - sd = strides[i] if i < len(strides) else None - slices.append(slice(bg, - ed, - sd)) + new_end = end[i] + new_stride = strides[i] if i < len(strides) else None + slices.append(slice(new_begin, + new_end, + new_stride)) return data[tuple(slices)] diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index fe7ceb6015db..3e479db1c02b 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -151,7 +151,7 @@ def strided_slice(a, begin, end, strides=None, ignore_end=False): in that particular axis. ignore_end: boolean, optional - Specifies whether to ignore input end. + Specifies whether to ignore negative elements of input end. Returns ------- From f422d8e8605f6c8222f2682b5910d3c35bddabcc Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Thu, 28 May 2020 01:24:38 +0800 Subject: [PATCH 16/22] add out_indices for gpu get_valid_counts --- python/tvm/relay/frontend/tensorflow.py | 1 + python/tvm/relay/frontend/tflite.py | 2 +- .../frontend/tensorflow/test_forward.py | 3 ++- tests/python/relay/test_op_level5.py | 6 ++++-- .../python/relay/test_pass_alter_op_layout.py | 2 +- topi/python/topi/cuda/conv2d_alter_op.py | 3 ++- topi/python/topi/cuda/nms.py | 19 +++++++++++++------ 7 files changed, 24 insertions(+), 12 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 34fc4893773e..d4c658ab44e4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -612,6 +612,7 @@ def _impl(inputs, attr, params, mod): out = _op.transpose(out, axes=(0, 2, 3, 4, 1)) return out + return _impl def _nms(): def _impl(inputs, attr, params, mod): diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index cb10ce5ee924..7868aefc2996 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -2439,7 +2439,7 @@ def convert_detection_postprocess(self, op): ret = _op.vision.multibox_transform_loc(cls_pred, loc_prob, anchor_expr, **multibox_transform_loc_attrs) - ret = _op.vision.non_max_suppression(ret[0], ret[1], **non_max_suppression_attrs) + ret = _op.vision.non_max_suppression(ret[0], ret[1], ret[1], **non_max_suppression_attrs) ret = _op.vision.get_valid_counts(ret, 0) valid_count = ret[0] # keep only the top 'max_detections' rows diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index b0409d7d0eac..403df7018c6f 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -469,7 +469,7 @@ def test_forward_convolution(): ####################################################################### # Convolution3D -# ----------- +# ------------- def _test_convolution3d(opname, tensor_in_sizes, filter_in_sizes, @@ -3313,6 +3313,7 @@ def test_forward_isfinite(): # NN test_forward_convolution() + test_forward_convolution3d() test_forward_pooling() test_forward_concat_v2() test_forward_lrn() diff --git 
a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index c20a66729712..40842ebcfde2 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -317,11 +317,13 @@ def verify_nms(x0_data, x1_data, x2_data, dshape, ref_res, ref_indices_res, intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data) - tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + if target == 'cuda': + return + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data, x2_data) + tvm.testing.assert_allclose(op_indices_res1[0].asnumpy(), ref_indices_res, rtol=1e-5) op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data, x2_data) tvm.testing.assert_allclose(op_indices_res2[0].asnumpy(), ref_indices_res, rtol=1e-5) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index ee4a27c20316..bbe10c773ff9 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -644,7 +644,7 @@ def expected(): y = relay.strided_slice(y, begin=relay.const([0, 4], "int32"), - end=relay.const([1, 21], "int32"), # [1, 8] + end=relay.const([1, 21], "int32"), strides=relay.const([1, 1], "int32")) y = relay.layout_transform(y, "NCHW4c", "NCHW") diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index c1e207cc2938..c2a19054434e 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -246,7 +246,8 @@ def _conv2d_legalize(attrs, inputs, arg_types): new_attrs['channels'] = new_out_channel out = tvm.relay.nn.conv2d(data, kernel, **new_attrs) original_out_shape = [x.value for x in output_tensor.shape] - out = relay.strided_slice(out, begin=(0, 0, 0, 0), end=original_out_shape) + out = relay.strided_slice(out, begin=relay.const([0, 0, 0, 0]), + end=relay.const(original_out_shape)) else: out = relay.nn.conv2d(data, kernel, **new_attrs) return out diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 2a206f6cbe68..c72cdad0454c 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -43,7 +43,8 @@ def atomic_add(x, y): return tvm.tir.call_pure_intrin(y.dtype, "atomic_add", x, y) -def get_valid_counts_ir(data, valid_count, out, score_threshold, id_index, score_index): +def get_valid_counts_ir(data, valid_count, out, out_indices, + score_threshold, id_index, score_index): """Low level IR to get valid count of bounding boxes given a score threshold. Also prepares to move valid boxes to the top of input data. 
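For reference, the semantics being implemented here can be sketched in plain NumPy. The helper below is illustrative only (it is not part of the patch, and it ignores id_index for brevity); it shows the compacted layout the operator ultimately produces, with invalid slots filled with -1:

    import numpy as np

    def get_valid_counts_ref(data, score_threshold=0.0, score_index=1):
        # data: (batch, num_anchors, elem_length)
        batch = data.shape[0]
        valid_count = np.zeros(batch, dtype="int32")
        out = np.full_like(data, -1.0)
        out_indices = np.full(data.shape[:2], -1, dtype="int32")
        for b in range(batch):
            # a box is valid when its score exceeds the threshold
            keep = np.where(data[b, :, score_index] > score_threshold)[0]
            valid_count[b] = len(keep)
            out[b, :len(keep)] = data[b, keep]
            out_indices[b, :len(keep)] = keep.astype("int32")
        return valid_count, out, out_indices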
@@ -83,6 +84,7 @@ def get_valid_counts_ir(data, valid_count, out, score_threshold, id_index, score valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) + out_indices = ib.buffer_ptr(out_indices) atomic_add_return = ib.allocate( valid_count.dtype, (1,), name='atomic_add_return', scope='local') one_count = tvm.tir.const(1, dtype=valid_count.dtype) @@ -115,9 +117,11 @@ def get_valid_counts_ir(data, valid_count, out, score_threshold, id_index, score valid_count[i]), one_count) with ib.for_range(0, elem_length) as k: out[tid * elem_length + k] = data[tid * elem_length + k] + out_indices[tid + k] = tid + k with ib.else_scope(): with ib.for_range(0, elem_length) as k: out[tid * elem_length + k] = -one + out_indices[tid + k] = -one_count return ib.get() @@ -149,24 +153,27 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): Rearranged data tensor. """ batch_size = data.shape[0] + num_anchors = data.shape[1] data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) valid_count_buf = tvm.tir.decl_buffer( (batch_size,), "int32", "valid_count_buf", data_alignment=8) out_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "out_buf", data_alignment=8) + out_indices_buf = tvm.tir.decl_buffer( + (batch_size, num_anchors), "int32", "out_buf", data_alignment=8) - valid_count, out = \ - te.extern([(batch_size,), data.shape], [data], + valid_count, out, out_indices = \ + te.extern([(batch_size,), data.shape, (batch_size, num_anchors)], [data], lambda ins, outs: get_valid_counts_ir( - ins[0], outs[0], outs[1], score_threshold, id_index, score_index), + ins[0], outs[0], outs[1], outs[2], score_threshold, id_index, score_index), dtype=["int32", data.dtype], in_buffers=[data_buf], - out_buffers=[valid_count_buf, out_buf], + out_buffers=[valid_count_buf, out_buf, out_indices_buf], name="get_valid_counts", tag="get_valid_counts_gpu") - return [valid_count, out] + return [valid_count, out, out_indices] def nms_ir(data, sorted_index, valid_count, out, box_indices, From 6caa7562a6ee52930841936cc848cb3267c0e2db Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 29 May 2020 00:22:13 +0800 Subject: [PATCH 17/22] change to slice_mode --- include/tvm/relay/attrs/transform.h | 4 +- python/tvm/relay/op/_transform.py | 40 ++++++++++++++----- python/tvm/relay/op/transform.py | 6 +-- src/relay/op/tensor/transform.cc | 32 +++++++++++---- src/relay/transforms/pattern_util.h | 2 +- tests/python/relay/test_any.py | 8 ++-- tests/python/relay/test_op_level4.py | 12 +++--- .../test_pass_combine_parallel_conv2d.py | 18 ++++----- topi/include/topi/transform.h | 15 +++++-- .../topi/testing/strided_slice_python.py | 17 ++++++-- topi/python/topi/transform.py | 6 +-- 11 files changed, 106 insertions(+), 54 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index c76b867f3b13..42e894bc0c8b 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -213,13 +213,13 @@ struct StridedSliceAttrs : public tvm::AttrsNode { Optional> begin; Optional> end; Optional> strides; - bool ignore_end; + bool slice_mode; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { TVM_ATTR_FIELD(begin).describe("Indices for begin of slice, begin index is also inclusive"); TVM_ATTR_FIELD(end).describe("Indices for end of slice, end index is exclusive"); TVM_ATTR_FIELD(strides).describe("Stride values of the slice"); - TVM_ATTR_FIELD(ignore_end) + TVM_ATTR_FIELD(slice_mode) .set_default(false) 
.describe("Whether to ignore the negative elements in input end."); } diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 4d665fab5eb1..3a22c16be3fa 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -106,25 +106,34 @@ def arange_shape_func(attrs, inputs, _): @script def _strided_slice_shape_func_input_data(data, begin, end, strides, - ignore_end): + slice_mode): ndim = len(data.shape) out = output_tensor((ndim,), "int64") for i in const_range(ndim): cbegin = 0 cend = data.shape[i] cstride = 1 + if strides.shape[0] > i: + cstride = strides[i] if begin.shape[0] > i: cbegin = begin[i] - if ignore_end != 0 or end.shape[0] > i: + if end.shape[0] <= i: + cend = data.shape[i] + elif slice_mode != 0: + if end[i] < 0: + cend = data.shape[i] + elif cstride < 0: + cend = cbegin - end[i] + else: + cend = cbegin + end[i] + else: cend = end[i] - if strides.shape[0] > i: - cstride = strides[i] assert cstride != 0, "Strides can't be zero." out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) return out @script -def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, ignore_end): +def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, slice_mode): ndim = data_shape.shape[0] assert ndim == 2, "not correct" out = output_tensor((ndim,), "int64") @@ -132,12 +141,21 @@ def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, ignor cbegin = int64(0) cend = int64(data_shape[i]) cstride = int64(1) + if len(strides) > i: + cstride = int64(strides[i]) if len(begin) > i: cbegin = int64(begin[i]) - if len(end) > i: + if len(end) <= i: + cend = int64(data_shape[i]) + elif slice_mode != 0: + if end[i] < 0: + cend = int64(data_shape[i]) + elif cstride < 0: + cend = cbegin - int64(end[i]) + else: + cend = cbegin + int64(end[i]) + else: cend = int64(end[i]) - if ignore_end != 0 and len(strides) > i: - cstride = int64(strides[i]) assert cstride != 0, "Strides can't be zero." out[i] = int64(ceil_div((int64(cend) - int64(cbegin)), int64(cstride))) return out @@ -148,12 +166,12 @@ def strided_slice_shape_func(attrs, inputs, _): """ Shape func for strided_slice """ - ignore_end = convert(get_const_int(attrs.ignore_end)) + slice_mode = convert(get_const_int(attrs.slice_mode)) # data independent if begin, end and strides exist if attrs.begin and attrs.end and attrs.strides: return [_strided_slice_shape_func_input_shape(inputs[0], attrs.begin, attrs.end, - attrs.strides, ignore_end)] - return [_strided_slice_shape_func_input_data(*inputs, ignore_end)] + attrs.strides, slice_mode)] + return [_strided_slice_shape_func_input_data(*inputs, slice_mode)] @script def _concatenate_shape_func(inputs, axis): diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 8e10e7c7775b..6e52f7896d2b 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -611,7 +611,7 @@ def split(data, indices_or_sections, axis=0): return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size) -def strided_slice(data, begin, end, strides=None, ignore_end=False): +def strided_slice(data, begin, end, strides=None, slice_mode=False): """Strided slice of an array. Parameters @@ -629,7 +629,7 @@ def strided_slice(data, begin, end, strides=None, ignore_end=False): Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. 
- ignore_end: boolean, optional + slice_mode: boolean, optional Whether to ignore the negative elements in input end, will slice to the end of data for the ignored element. @@ -645,7 +645,7 @@ def strided_slice(data, begin, end, strides=None, ignore_end=False): end = const(list(end)) if isinstance(strides, list): strides = const(list(strides)) - return _make.strided_slice(data, begin, end, strides, ignore_end) + return _make.strided_slice(data, begin, end, strides, slice_mode) def strided_set(data, v, begin, end, strides=None): diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index fe320a5e0d2b..a405aca73983 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1711,9 +1711,16 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr std::vector end_vec; for (size_t i = 0; i < param->end.value().size(); ++i) { // allow end to be None - if (!param->end.value()[i].defined() || - (param->ignore_end && param->end.value()[i]->value < 0)) { + if (!param->end.value()[i].defined()) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); + } else if (param->slice_mode) { + if (param->end.value()[i]->value < 0) { + end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); + } else if (stride_vec[i] < 0) { + end_vec.push_back(begin_vec[i] - param->end.value()[i]->value); + } else { + end_vec.push_back(begin_vec[i] + param->end.value()[i]->value); + } } else { end_vec.push_back(param->end.value()[i]->value); } @@ -1834,10 +1841,19 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, } } int64_t bg = begin[i].defined() ? begin[i]->value : 0; - int64_t ed = end[i].defined() ? end[i]->value : shape[i].as()->value; - if (params->ignore_end && end[i].defined() && end[i]->value < 0) { + int64_t ed; + if (!end[i].defined()) { ed = shape[i].as()->value; + } else if (params->slice_mode) { + if (end[i]->value < 0) { + ed = shape[i].as()->value; + } else { + ed = bg + end[i]->value; + } + } else { + ed = end[i]->value; } + if (bg % factor || ed % factor) { // transform to original layout return {{Layout::Undef()}, {Layout::Undef()}}; @@ -1886,7 +1902,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Arrayend.value(); strides = param->strides.value(); return Array{ - topi::strided_slice(inputs[0], begin, end, strides, param->ignore_end)}; + topi::strided_slice(inputs[0], begin, end, strides, param->slice_mode)}; } else { te::Tensor data = inputs[0]; te::Tensor begin = inputs[1]; @@ -1904,7 +1920,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Array(); const ConstantNode *cbegin, *cend, *cstrides; if ((cbegin = begin.as()) && (cend = end.as()) && @@ -1926,7 +1942,7 @@ Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool ignore attrs->end = end; attrs->strides = strides; } - attrs->ignore_end = ignore_end; + attrs->slice_mode = slice_mode; static const Op& op = Op::Get("strided_slice"); return Call(op, {data, begin, end, strides}, Attrs(attrs), {}); } @@ -1962,7 +1978,7 @@ Examples:: .add_argument("begin", "Tensor", "The indices to begin with in the slicing.") .add_argument("end", "Tensor", "Indices indicating end of the slice.") .add_argument("strides", "Tensor", "The stride values.") - .add_argument("ignore_end", "Tensor", "Whether to ignore negative elements of input end.") + .add_argument("slice_mode", "Tensor", "Whether to ignore negative elements of input end.") .set_support_level(4) .set_attrs_type() .add_type_rel("StridedSlice", StridedSliceRel) diff --git 
a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index 89f29fcc0cce..2e73c632230b 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -673,7 +673,7 @@ Expr MakeConcatenate(Expr data, int axis); Expr MakeRepeat(Expr data, int repeats, int axis); -Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool ignore_end); +Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool slice_mode); Expr MakeStack(Expr data, int axis); diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 4c131715d532..e09a93b0744d 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -644,14 +644,14 @@ def test_arange_with_dynamic_shape(): tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, - data_np_shape, ignore_end=False, const_attrs=False, dtype="int32"): + data_np_shape, slice_mode=False, const_attrs=False, dtype="int32"): # Generate random numpy input data np_data = np.random.uniform(size=data_np_shape).astype('float32') np_begin = np.random.randint(2, size=begin_shape, dtype=dtype) np_end = np.random.randint(5, 15, size=end_shape, dtype=dtype) np_strides = np.random.randint(1, 3, size=strides_shape, dtype=dtype) # target numpy result - ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, ignore_end) + ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, slice_mode) # Relay Module mod = tvm.IRModule() @@ -670,7 +670,7 @@ def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, np_inputs = [np_data, np_begin, np_end, np_strides] y = relay.strided_slice(data, begin=begin, end=end, - strides=strides, ignore_end=ignore_end) + strides=strides, slice_mode=slice_mode) mod["main"] = relay.Function(args, y) for kind in ["debug", "vm"]: @@ -684,7 +684,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) - verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), ignore_end=True) + verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), slice_mode=True) def test_recursive_concat(): diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 081236badd02..4f04a3a53307 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -296,7 +296,7 @@ def test_mean_var_std(): def test_strided_slice(): - def verify(dshape, begin, end, strides, output, ignore_end=False, + def verify(dshape, begin, end, strides, output, slice_mode=False, attr_const=True, test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) @@ -306,7 +306,7 @@ def verify(dshape, begin, end, strides, output, ignore_end=False, # target numpy result x_data = np.random.uniform(size=dshape).astype("float32") ref_res = topi.testing.strided_slice_python( - x_data, begin, end, strides, ignore_end) + x_data, begin, end, strides, slice_mode) if attr_const: begin = relay.const(begin, dtype=dtype) @@ -319,12 +319,12 @@ def verify(dshape, begin, end, strides, output, ignore_end=False, begin=begin, end=end, strides=strides, - ignore_end=ignore_end) + slice_mode=slice_mode) else: z = 
relay.strided_slice(x, begin=begin, end=end, - ignore_end=ignore_end) + slice_mode=slice_mode) func = relay.Function([x], z) func = run_infer_type(func) @@ -354,9 +354,9 @@ def verify(dshape, begin, end, strides, output, ignore_end=False, verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 2], - (2, 4, 2), ignore_end=True, test_ref=False) + (2, 4, 2), slice_mode=True, test_ref=False) verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 2], - (2, 2, 2), ignore_end=True, test_ref=True) + (2, 2, 2), slice_mode=True, test_ref=True) def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 28c9655808f5..112791d83fb7 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -53,18 +53,18 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], 'int64'), - ignore_end=True) + slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels1 + channels2], "int64"), strides=relay.const([1, 1], 'int64'), - ignore_end=True) + slice_mode=True) y3 = relay.nn.conv2d(x, w3) y4 = relay.strided_slice(y, begin=relay.const([0, channels1 + channels2], "int64"), end=relay.const([-1, channels1 + channels2 + channels4], "int64"), strides=relay.const([1, 1], 'int64'), - ignore_end=True) + slice_mode=True) y5 = relay.nn.max_pool2d(x) y = relay.Tuple((y1, y2, y3, y4, y5)) return relay.Function(args, y) @@ -113,12 +113,12 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels1 + channels2], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y2 = relay.add(y2, bias) y = relay.Tuple((y1, y2)) return relay.Function(args, y) @@ -160,12 +160,12 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels1 + channels2], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y1 = relay.multiply(y1, scale1) y2 = relay.multiply(y2, scale2) y = relay.Tuple((y1, y2)) @@ -208,12 +208,12 @@ def expected(x, w, channels, repeat): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels], "int64"), end=relay.const([-1, channels * 2], "int64"), strides=relay.const([1, 1], "int64"), - ignore_end=True) + slice_mode=True) y = relay.concatenate((y1, y2), axis=1) return relay.Function(args, y) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 8201a50335ef..873eae5bc76f 100644 --- a/topi/include/topi/transform.h +++ 
b/topi/include/topi/transform.h @@ -520,7 +520,7 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \param begin The indices to begin with in the slicing * \param end Indicies indicating end of the slice * \param strides Specifies the stride values, it can be negative - * \param ignore_end Specifies whether to ignore negative elements of input end + * \param slice_mode Specifies whether to ignore negative elements of input end * in that case, the input tensor will be reversed in that particular axis * \param name The name of the operation * \param tag The tag to mark the operation @@ -528,7 +528,7 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \return A Tensor whose op member is the split operation */ inline Tensor strided_slice(const Tensor& x, const Array& begin, const Array& end, - const Array& strides, const bool& ignore_end, + const Array& strides, const bool& slice_mode, std::string name = "T_strided_slice", std::string tag = kInjective) { size_t src_tensor_dim = static_cast(x->shape.size()); // Setup the ranges. @@ -560,8 +560,17 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const std::vector end_vec; for (size_t i = 0; i < end.size(); ++i) { // allow end to be None - if (!end[i].defined() || (ignore_end && end[i]->value < 0)) { + + if (!end[i].defined()) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); + } else if (slice_mode) { + if (end[i]->value < 0) { + end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); + } else if (stride_vec[i] > 0) { + end_vec.push_back(begin_vec[i] + end[i]->value); + } else { + end_vec.push_back(begin_vec[i] - end[i]->value); + } } else { end_vec.push_back(end[i]->value); } diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py index c60f05fef66d..a72402ca4dc2 100644 --- a/topi/python/topi/testing/strided_slice_python.py +++ b/topi/python/topi/testing/strided_slice_python.py @@ -17,7 +17,7 @@ """strided_slice/set in python""" -def strided_slice_python(data, begin, end, strides, ignore_end=False): +def strided_slice_python(data, begin, end, strides, slice_mode=False): """Python version of strided slice operator. Parameters @@ -34,7 +34,7 @@ def strided_slice_python(data, begin, end, strides, ignore_end=False): strides : list The stride of each slice. - ignore_end : boolean + slice_mode : boolean Whether to ignore negative elements of input end Returns @@ -45,12 +45,21 @@ def strided_slice_python(data, begin, end, strides, ignore_end=False): strides = [] if strides is None else strides slices = [] for i in range(len(data.shape)): + new_stride = strides[i] if i < len(strides) else None + new_begin = begin[i] if i < len(begin) else None - if i >= len(end) or (ignore_end and end[i] < 0): + if i >= len(end): new_end = None + elif slice_mode: + if end[i] < 0: + new_end = None + elif new_stride and new_stride < 0: + new_end = new_begin - end[i] + else: + new_end = new_begin + end[i] else: new_end = end[i] - new_stride = strides[i] if i < len(strides) else None + slices.append(slice(new_begin, new_end, new_stride)) diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index 3e479db1c02b..209c4b2d837c 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -131,7 +131,7 @@ def flip(a, axis=0): """ return cpp.flip(a, axis) -def strided_slice(a, begin, end, strides=None, ignore_end=False): +def strided_slice(a, begin, end, strides=None, slice_mode=False): """Slice of an array. 
Parameters @@ -150,7 +150,7 @@ def strided_slice(a, begin, end, strides=None, ignore_end=False): in that case, the input tensor will be reversed in that particular axis. - ignore_end: boolean, optional + slice_mode: boolean, optional Specifies whether to ignore negative elements of input end. Returns @@ -159,7 +159,7 @@ def strided_slice(a, begin, end, strides=None, ignore_end=False): """ if strides is None: strides = [] - return cpp.strided_slice(a, begin, end, strides, ignore_end) + return cpp.strided_slice(a, begin, end, strides, slice_mode) @tvm.te.tag_scope(tag=tag.INJECTIVE+",strided_set") def strided_set(a, v, begin, end, strides=None): From 69a440e54b39c11e98a1d3f9cdda201b875469cc Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 31 May 2020 04:53:04 +0800 Subject: [PATCH 18/22] clang-format, fix comments --- include/tvm/relay/attrs/transform.h | 6 +++- python/tvm/relay/frontend/pytorch.py | 16 ++++++++-- python/tvm/relay/frontend/tensorflow.py | 22 ++++++------- python/tvm/relay/op/_transform.py | 6 ++-- python/tvm/relay/op/transform.py | 30 +++++++++-------- src/relay/op/tensor/transform.cc | 32 +++++++++---------- .../transforms/combine_parallel_conv2d.cc | 1 + tests/python/relay/test_any.py | 6 ++-- tests/python/relay/test_op_level4.py | 9 +++--- .../test_pass_combine_parallel_conv2d.py | 10 +++--- topi/include/topi/transform.h | 18 ++++------- .../topi/testing/strided_slice_python.py | 11 ++++--- topi/python/topi/transform.py | 5 ++- 13 files changed, 92 insertions(+), 80 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 42e894bc0c8b..c1700ed14b79 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -221,7 +221,11 @@ struct StridedSliceAttrs : public tvm::AttrsNode { TVM_ATTR_FIELD(strides).describe("Stride values of the slice"); TVM_ATTR_FIELD(slice_mode) .set_default(false) - .describe("Whether to ignore the negative elements in input end."); + .describe( + "Specifies whether to enable slice mode. In slice mode," + "strides will be ignored, end indicates the size of a slice" + "starting at the location specified by begin. 
If end[i] is -1," + "all remaining elements in that dimension are included in the slice"); } }; diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index cc7cd4830cd4..c7c80de15a54 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -243,7 +243,11 @@ def _impl(inputs, input_types): end[dim] = inputs[3] strides.append(int(inputs[4])) - return _op.transform.strided_slice(data, begin, end, strides) + return _op.transform.strided_slice(data, + begin=_expr.const(begin), + end=_expr.const(end), + strides=_expr.const(strides), + slice_mode=True) return _impl def _split(): @@ -1233,7 +1237,10 @@ def _impl(inputs, input_types): end[axis] = i + unif_size stride = [1] * len(shape) - chunk_out = _op.transform.strided_slice(data, begin, end, stride) + chunk_out = _op.transform.strided_slice(data, + begin=_expr.const(begin), + end=_expr.const(end), + strides=_expr.const(stride)) chunks.append(chunk_out) if dim % num_chunks: @@ -1243,7 +1250,10 @@ def _impl(inputs, input_types): end[axis] = dim stride = [1] * len(shape) - chunk_out = _op.transform.strided_slice(data, begin, end, stride) + chunk_out = _op.transform.strided_slice(data, + begin=_expr.const(begin), + end=_expr.const(end), + strides=_expr.const(stride)) chunks.append(chunk_out) return chunks diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index d4c658ab44e4..002fb857e258 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -652,11 +652,12 @@ def _impl(inputs, attr, params, mod): invalid_to_bottom=False) # squeeze it, TF NMS is not batched - end = get_relay_op("squeeze")(nms_ret[1], axis=[1]) + size = get_relay_op("squeeze")(nms_ret[1], axis=[1]) data_slice = get_relay_op("squeeze")(nms_ret[0], axis=[0]) # slice to get the dynamic result - ret = get_relay_op("strided_slice")(data_slice, _expr.const([0]), end, _expr.const([1])) + ret = get_relay_op("strided_slice")(data_slice, begin=_expr.const([0]), + end=size, slice_mode=True) return ret return _impl @@ -1165,7 +1166,11 @@ def _impl(inputs, attr, params, mod): try: begin = _get_list_param(params, inputs[1]) except (IndexError, KeyError, AttributeError): - begin = _infer_value(inputs[1], params).asnumpy().tolist()[0] + # Handle symbolic begin + try: + begin = _infer_value(inputs[1], params).asnumpy().tolist()[0] + except Exception: + begin = inputs[1] try: size = _get_list_param(params, inputs[2]) except (IndexError, KeyError, AttributeError): @@ -1174,16 +1179,7 @@ def _impl(inputs, attr, params, mod): size = _infer_value(inputs[2], params).asnumpy().tolist()[0] except Exception: size = inputs[2] - data_shape = _infer_shape(inputs[0], mod) - data_dim = len(data_shape) - end = size - if not isinstance(end, (_expr.Call, _expr.Var)): - for i in range(data_dim): - if size[i] == -1: - end[i] = data_shape[i] - else: - end[i] += begin[i] - return _op.strided_slice(inputs[0], begin=begin, end=end) + return _op.strided_slice(inputs[0], begin=begin, end=size, slice_mode=True) return _impl diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 3a22c16be3fa..1ddd335e4d9c 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -120,10 +120,9 @@ def _strided_slice_shape_func_input_data(data, begin, end, strides, if end.shape[0] <= i: cend = data.shape[i] elif slice_mode != 0: + cstride = 1 if end[i] < 0: cend = data.shape[i] - elif cstride < 0: - cend = cbegin - 
end[i] else: cend = cbegin + end[i] else: @@ -148,10 +147,9 @@ def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, slice if len(end) <= i: cend = int64(data_shape[i]) elif slice_mode != 0: + cstride = int64(1) if end[i] < 0: cend = int64(data_shape[i]) - elif cstride < 0: - cend = cbegin - int64(end[i]) else: cend = cbegin + int64(end[i]) else: diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 6e52f7896d2b..38a17c41bd32 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -619,19 +619,21 @@ def strided_slice(data, begin, end, strides=None, slice_mode=False): data : relay.Expr The source array to be sliced. - begin: relay.Expr or List[int] + begin: relay.Expr, Tuple[int], or List[int] The indices to begin with in the slicing. - end: relay.Expr or List[int] + end: relay.Expr, Tuple[int], or List[int] Indices indicating end of the slice. - strides: relay.Expr or List[int], optional + strides: relay.Expr, Tuple[int], or List[int], optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. slice_mode: boolean, optional - Whether to ignore the negative elements in input end, - will slice to the end of data for the ignored element. + Specifies whether to enable slice mode. In slice mode, + strides will be ignored, end indicates the size of a slice + starting at the location specified by begin. If end[i] is -1, + all remaining elements in that dimension are included in the slice Returns ------- @@ -639,11 +641,11 @@ def strided_slice(data, begin, end, strides=None, slice_mode=False): The computed result. """ strides = strides or const([1], dtype="int32") - if isinstance(begin, list): + if isinstance(begin, (tuple, list)): begin = const(list(begin)) - if isinstance(end, list): + if isinstance(end, (tuple, list)): end = const(list(end)) - if isinstance(strides, list): + if isinstance(strides, (tuple, list)): strides = const(list(strides)) return _make.strided_slice(data, begin, end, strides, slice_mode) @@ -659,13 +661,13 @@ def strided_set(data, v, begin, end, strides=None): v : relay.Expr The data to be set. - begin: relay.Expr or List[int] + begin: relay.Expr, Tuple[int], or List[int] The indices to begin with in the slicing. - end: relay.Expr or List[int] + end: relay.Expr, Tuple[int], or List[int] Indices indicating end of the slice. - strides: relay.Expr or List[int], optional + strides: relay.Expr, Tuple[int], or List[int], optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. @@ -675,11 +677,11 @@ def strided_set(data, v, begin, end, strides=None): The computed result. 
""" strides = strides or const([1], dtype="int32") - if isinstance(begin, list): + if isinstance(begin, (tuple, list)): begin = const(list(begin)) - if isinstance(end, list): + if isinstance(end, (tuple, list)): end = const(list(end)) - if isinstance(strides, list): + if isinstance(strides, (tuple, list)): strides = const(list(strides)) return _make.strided_set(data, v, begin, end, strides) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index a405aca73983..a2d4704dfa3b 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1687,13 +1687,13 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr // calculate output shape std::vector oshape(num_axis); if (param->begin && param->end && param->strides) { - std::vector stride_vec; - for (Integer i : param->strides.value()) { - CHECK(i.defined()); - stride_vec.push_back(i->value); - } - for (int64_t i = stride_vec.size(); i < num_axis; ++i) { - stride_vec.push_back(1); + // stride will be set as 1 if slice mode is enabled + std::vector stride_vec(num_axis, 1); + if (!param->slice_mode) { + for (size_t i = 0; i < param->strides.value().size(); ++i) { + CHECK(param->strides.value()[i].defined()); + stride_vec[i] = param->strides.value()[i]->value; + } } const int64_t max_range = std::numeric_limits::max(); std::vector begin_vec; @@ -1714,13 +1714,11 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr if (!param->end.value()[i].defined()) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); } else if (param->slice_mode) { - if (param->end.value()[i]->value < 0) { - end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } else if (stride_vec[i] < 0) { - end_vec.push_back(begin_vec[i] - param->end.value()[i]->value); - } else { - end_vec.push_back(begin_vec[i] + param->end.value()[i]->value); - } + if (param->end.value()[i]->value < 0) { + end_vec.push_back(max_range); + } else { + end_vec.push_back(begin_vec[i] + param->end.value()[i]->value); + } } else { end_vec.push_back(param->end.value()[i]->value); } @@ -1762,7 +1760,7 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr } else { if (begin_v < 0) begin_v = 0; CHECK_GE(stride_v, 0); - CHECK_LT(begin_v, end_v) << "strided_slice get empty slice at axis " << i; + CHECK_LE(begin_v, end_v) << "strided_slice get invalid slice at axis " << i; end_v = std::min(dim_size, end_v); slice_range = end_v - begin_v; step = stride_v; @@ -1807,7 +1805,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, if (params->begin && params->end && params->strides) { for (Integer i : params->strides.value()) { CHECK(i.defined()); - strides.push_back(i->value); + strides.push_back(params->slice_mode ? 
1 : i->value); } for (Integer i : params->begin.value()) { @@ -1978,7 +1976,7 @@ Examples:: .add_argument("begin", "Tensor", "The indices to begin with in the slicing.") .add_argument("end", "Tensor", "Indices indicating end of the slice.") .add_argument("strides", "Tensor", "The stride values.") - .add_argument("slice_mode", "Tensor", "Whether to ignore negative elements of input end.") + .add_argument("slice_mode", "Tensor", "Whether to enable slice mode.") .set_support_level(4) .set_attrs_type() .add_type_rel("StridedSlice", StridedSliceRel) diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 04ed35b709e5..d680de49b005 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -192,6 +192,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { for (size_t i = 0; i < begin.size(); ++i) { begin_data[i] = begin[i]; end_data[i] = end[i]; + end_data[i] -= begin_data[i]; strides_data[i] = 1; } diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index e09a93b0744d..27a2c6250dbe 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -648,8 +648,8 @@ def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, # Generate random numpy input data np_data = np.random.uniform(size=data_np_shape).astype('float32') np_begin = np.random.randint(2, size=begin_shape, dtype=dtype) - np_end = np.random.randint(5, 15, size=end_shape, dtype=dtype) - np_strides = np.random.randint(1, 3, size=strides_shape, dtype=dtype) + np_end = np.random.randint(5, 10, size=end_shape, dtype=dtype) + np_strides = np.random.randint(1, 2 if slice_mode else 3, size=strides_shape, dtype=dtype) # target numpy result ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, slice_mode) @@ -684,7 +684,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) - verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70), slice_mode=True) + verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21), slice_mode=True) def test_recursive_concat(): diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 4f04a3a53307..dad1adebbf4e 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -342,6 +342,7 @@ def verify(dshape, begin, end, strides, output, slice_mode=False, op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + verify((1, 3, 10, 10), [0, 0, 0, 0], [-1, 3, 10, 10], [1], (0, 3, 10, 10), dtype="int64") verify((1, 224, 224, 3), [0, 20, 20, 0], [1, 140, 140, 3], [1, 1, 1, 1], (1, 120, 120, 3), dtype="int64") verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3), dtype="int16") @@ -353,10 +354,10 @@ def verify(dshape, begin, end, strides, output, slice_mode=False, verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3)) verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) - verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 2], - (2, 4, 2), slice_mode=True, test_ref=False) - verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 2], - (2, 2, 2), slice_mode=True, test_ref=True) + verify((3, 4, 3), [1, 0, 0], [3, 
-1, 3], [1, 1, 1], + (2, 4, 3), slice_mode=True, test_ref=False) + verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], + (2, 2, 3), slice_mode=True, test_ref=True) def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 112791d83fb7..429fd620e09e 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -56,13 +56,13 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([-1, channels1 + channels2], "int64"), + end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], 'int64'), slice_mode=True) y3 = relay.nn.conv2d(x, w3) y4 = relay.strided_slice(y, begin=relay.const([0, channels1 + channels2], "int64"), - end=relay.const([-1, channels1 + channels2 + channels4], "int64"), + end=relay.const([-1, channels4], "int64"), strides=relay.const([1, 1], 'int64'), slice_mode=True) y5 = relay.nn.max_pool2d(x) @@ -116,7 +116,7 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([-1, channels1 + channels2], "int64"), + end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], "int64"), slice_mode=True) y2 = relay.add(y2, bias) @@ -163,7 +163,7 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), - end=relay.const([-1, channels1 + channels2], "int64"), + end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], "int64"), slice_mode=True) y1 = relay.multiply(y1, scale1) @@ -211,7 +211,7 @@ def expected(x, w, channels, repeat): slice_mode=True) y2 = relay.strided_slice(y, begin=relay.const([0, channels], "int64"), - end=relay.const([-1, channels * 2], "int64"), + end=relay.const([-1, channels], "int64"), strides=relay.const([1, 1], "int64"), slice_mode=True) y = relay.concatenate((y1, y2), axis=1) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 873eae5bc76f..cacde55eb703 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -520,7 +520,7 @@ inline Array split(const Tensor& x, Array split_indices, int ax * \param begin The indices to begin with in the slicing * \param end Indicies indicating end of the slice * \param strides Specifies the stride values, it can be negative - * \param slice_mode Specifies whether to ignore negative elements of input end + * \param slice_mode Specifies whether to enable slice mode * in that case, the input tensor will be reversed in that particular axis * \param name The name of the operation * \param tag The tag to mark the operation @@ -534,14 +534,12 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const // Setup the ranges. // NOTE: this code duplicates the shape inference logic relay.op // Consider to refactor in the future. 
- std::vector stride_vec; - for (Integer i : strides) { - CHECK(i.defined()); - stride_vec.push_back(i->value); - } - for (size_t i = stride_vec.size(); i < src_tensor_dim; ++i) { - stride_vec.push_back(1); + std::vector stride_vec(src_tensor_dim, 1); + for (size_t i = 0; i < strides.size(); ++i) { + CHECK(strides[i].defined()); + stride_vec[i] = strides[i]->value; } + const int64_t max_range = std::numeric_limits::max(); std::vector begin_vec; @@ -566,10 +564,8 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const } else if (slice_mode) { if (end[i]->value < 0) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } else if (stride_vec[i] > 0) { - end_vec.push_back(begin_vec[i] + end[i]->value); } else { - end_vec.push_back(begin_vec[i] - end[i]->value); + end_vec.push_back(begin_vec[i] + end[i]->value); } } else { end_vec.push_back(end[i]->value); diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py index a72402ca4dc2..8d68fc1c1492 100644 --- a/topi/python/topi/testing/strided_slice_python.py +++ b/topi/python/topi/testing/strided_slice_python.py @@ -35,7 +35,10 @@ def strided_slice_python(data, begin, end, strides, slice_mode=False): The stride of each slice. slice_mode : boolean - Whether to ignore negative elements of input end + Specifies whether to enable slice mode. + In slice mode, strides will be ignored, + end indicates the size of a slice starting + at the location specified by begin. Returns ------- @@ -45,7 +48,9 @@ def strided_slice_python(data, begin, end, strides, slice_mode=False): strides = [] if strides is None else strides slices = [] for i in range(len(data.shape)): - new_stride = strides[i] if i < len(strides) else None + new_stride = None + if not slice_mode and i < len(strides): + new_stride = strides[i] new_begin = begin[i] if i < len(begin) else None if i >= len(end): @@ -53,8 +58,6 @@ def strided_slice_python(data, begin, end, strides, slice_mode=False): elif slice_mode: if end[i] < 0: new_end = None - elif new_stride and new_stride < 0: - new_end = new_begin - end[i] else: new_end = new_begin + end[i] else: diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index 209c4b2d837c..a5a564b106f1 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -151,7 +151,10 @@ def strided_slice(a, begin, end, strides=None, slice_mode=False): in that particular axis. slice_mode: boolean, optional - Specifies whether to ignore negative elements of input end. + Specifies whether to enable slice mode. In slice mode, + strides will be ignored, end indicates the size of a slice + starting at the location specified by begin. 
If end[i] is -1, + all remaining elements in that dimension are included in the slice Returns ------- From 4b7a92476f5d8d2bbbf717e07350d42a0aea3bb8 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Thu, 4 Jun 2020 02:16:28 +0800 Subject: [PATCH 19/22] fix comment --- python/tvm/relay/frontend/tensorflow.py | 21 +++++++++++++++------ python/tvm/relay/op/_transform.py | 1 - src/relay/op/tensor/transform.cc | 12 +++--------- tests/python/relay/test_any.py | 22 ++++++++++++---------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 002fb857e258..784e86b2f290 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -617,7 +617,16 @@ def _impl(inputs, attr, params, mod): def _nms(): def _impl(inputs, attr, params, mod): # Get parameter values - max_output_size = int(np.atleast_1d(inputs[2].data.asnumpy().astype("int64"))[0]) + # TODO(yongwww) change nms in relay to support symbolic max_output_size + try: + max_output_size = int(np.atleast_1d(inputs[2].data.asnumpy() + .astype("int64"))[0]) + except Exception: + try: + max_output_size = _infer_value(inputs[2], params, + mod).asnumpy().astype("int64").tolist()[0] + except Exception: + max_output_size = -1 iou_threshold = np.atleast_1d(inputs[3].data.asnumpy())[0] # score_threshold was introduced from V3 score_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0] if len(inputs) > 4 else 0.0 @@ -1168,7 +1177,7 @@ def _impl(inputs, attr, params, mod): except (IndexError, KeyError, AttributeError): # Handle symbolic begin try: - begin = _infer_value(inputs[1], params).asnumpy().tolist()[0] + begin = _infer_value(inputs[1], params).asnumpy().tolist() except Exception: begin = inputs[1] try: @@ -1176,7 +1185,7 @@ def _impl(inputs, attr, params, mod): except (IndexError, KeyError, AttributeError): # Handle symbolic size try: - size = _infer_value(inputs[2], params).asnumpy().tolist()[0] + size = _infer_value(inputs[2], params).asnumpy().tolist() except Exception: size = inputs[2] return _op.strided_slice(inputs[0], begin=begin, end=size, slice_mode=True) @@ -1509,9 +1518,9 @@ def _transform_mask(stride_dim, ellipsis_mask): if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask: begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask) out = _op.strided_slice(inputs[0], - begin=_expr.const(begin), - end=_expr.const(end), - strides=_expr.const(stride)) + begin=begin, + end=end, + strides=stride) out_shape = _infer_shape(out, mod=mod) if not fshape_indices: fshape_indices = range(len(out_shape)) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 1ddd335e4d9c..32be4929a971 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -134,7 +134,6 @@ def _strided_slice_shape_func_input_data(data, begin, end, strides, @script def _strided_slice_shape_func_input_shape(data_shape, begin, end, strides, slice_mode): ndim = data_shape.shape[0] - assert ndim == 2, "not correct" out = output_tensor((ndim,), "int64") for i in const_range(ndim): cbegin = int64(0) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index a2d4704dfa3b..e78f89b1d91c 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1927,15 +1927,9 @@ Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool slice_ CHECK_EQ(cend->data->ndim, 1); 
CHECK_EQ(cstrides->data->ndim, 1); Array begin, end, strides; - for (int i = 0; i < cbegin->data->shape[0]; i++) { - begin.push_back(Integer(static_cast(ToScalar(cbegin->data, i)))); - } - for (int i = 0; i < cend->data->shape[0]; i++) { - end.push_back(Integer(static_cast(ToScalar(cend->data, i)))); - } - for (int i = 0; i < cstrides->data->shape[0]; i++) { - strides.push_back(Integer(static_cast(ToScalar(cstrides->data, i)))); - } + begin = ToVector(cbegin->data); + end = ToVector(cend->data); + strides = ToVector(cstrides->data); attrs->begin = begin; attrs->end = end; attrs->strides = strides; diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 27a2c6250dbe..8fd6aed70a75 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -644,12 +644,12 @@ def test_arange_with_dynamic_shape(): tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, - data_np_shape, slice_mode=False, const_attrs=False, dtype="int32"): + data_np_shape, slice_mode=False, const_attrs=False): # Generate random numpy input data np_data = np.random.uniform(size=data_np_shape).astype('float32') - np_begin = np.random.randint(2, size=begin_shape, dtype=dtype) - np_end = np.random.randint(5, 10, size=end_shape, dtype=dtype) - np_strides = np.random.randint(1, 2 if slice_mode else 3, size=strides_shape, dtype=dtype) + np_begin = np.random.randint(2, size=begin_shape, dtype="int32") + np_end = np.random.randint(5, 10, size=end_shape, dtype="int32") + np_strides = np.random.randint(1, 2 if slice_mode else 3, size=strides_shape, dtype="int32") # target numpy result ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, slice_mode) @@ -657,15 +657,16 @@ def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, mod = tvm.IRModule() data = relay.var('data', shape=data_shape, dtype='float32') if const_attrs: - begin = relay.const(np_begin, dtype) - end = relay.const(np_end, dtype) - strides = relay.const(np_strides, dtype) + data = relay.var('data', shape=data_np_shape, dtype='float32') + begin = relay.const(np_begin) + end = relay.const(np_end) + strides = relay.const(np_strides) args = [data] np_inputs = [np_data] else: - begin = relay.var('begin', shape=begin_shape, dtype=dtype) - end = relay.var('end', shape=end_shape, dtype=dtype) - strides = relay.var('strides', shape=strides_shape, dtype=dtype) + begin = relay.var('begin', shape=begin_shape, dtype="int32") + end = relay.var('end', shape=end_shape, dtype="int32") + strides = relay.var('strides', shape=strides_shape, dtype="int32") args = [data, begin, end, strides] np_inputs = [np_data, np_begin, np_end, np_strides] @@ -685,6 +686,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21), slice_mode=True) + verify_any_strided_slice(any_dims(2), (2,), (2,), (2,), (15, 21), const_attrs=True) def test_recursive_concat(): From f37574c68cf84b62ab7a2a7b72c513adc44aea44 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 7 Jun 2020 00:03:39 +0800 Subject: [PATCH 20/22] change slice_mode to string --- include/tvm/relay/attrs/transform.h | 14 +++++---- python/tvm/relay/frontend/pytorch.py | 2 +- python/tvm/relay/frontend/tensorflow.py | 4 +-- python/tvm/relay/op/_transform.py | 2 +- 
python/tvm/relay/op/transform.py | 13 ++++---- src/relay/op/tensor/transform.cc | 18 ++++++----- .../transforms/combine_parallel_conv2d.cc | 30 ++++++------------- src/relay/transforms/pattern_util.h | 2 +- tests/python/relay/test_any.py | 6 ++-- tests/python/relay/test_op_level4.py | 6 ++-- .../test_pass_combine_parallel_conv2d.py | 18 +++++------ topi/include/topi/transform.h | 6 ++-- .../topi/testing/strided_slice_python.py | 18 ++++++----- topi/python/topi/transform.py | 19 ++++++------ 14 files changed, 77 insertions(+), 81 deletions(-) diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index c1700ed14b79..052bc608affc 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -213,18 +213,20 @@ struct StridedSliceAttrs : public tvm::AttrsNode { Optional> begin; Optional> end; Optional> strides; - bool slice_mode; + std::string slice_mode; TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") { TVM_ATTR_FIELD(begin).describe("Indices for begin of slice, begin index is also inclusive"); TVM_ATTR_FIELD(end).describe("Indices for end of slice, end index is exclusive"); - TVM_ATTR_FIELD(strides).describe("Stride values of the slice"); + TVM_ATTR_FIELD(strides).describe( + "Stride values of the slice, a stride can be negative, which causes a reverse slice."); TVM_ATTR_FIELD(slice_mode) - .set_default(false) + .set_default("end") .describe( - "Specifies whether to enable slice mode. In slice mode," - "strides will be ignored, end indicates the size of a slice" - "starting at the location specified by begin. If end[i] is -1," + "The slice mode [end, size]." + "end - The default slice mode, ending indices for the slice." + "size - The input strides will be ignored, input end in this mode indicates the size" + "of a slice starting at the location specified by begin. 
If end[i] is -1," "all remaining elements in that dimension are included in the slice"); } }; diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c7c80de15a54..d1cf8748315b 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -247,7 +247,7 @@ def _impl(inputs, input_types): begin=_expr.const(begin), end=_expr.const(end), strides=_expr.const(strides), - slice_mode=True) + slice_mode="size") return _impl def _split(): diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 784e86b2f290..a30f6f0fda2d 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -666,7 +666,7 @@ def _impl(inputs, attr, params, mod): # slice to get the dynamic result ret = get_relay_op("strided_slice")(data_slice, begin=_expr.const([0]), - end=size, slice_mode=True) + end=size, slice_mode="size") return ret return _impl @@ -1188,7 +1188,7 @@ def _impl(inputs, attr, params, mod): size = _infer_value(inputs[2], params).asnumpy().tolist() except Exception: size = inputs[2] - return _op.strided_slice(inputs[0], begin=begin, end=size, slice_mode=True) + return _op.strided_slice(inputs[0], begin=begin, end=size, slice_mode="size") return _impl diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 32be4929a971..a409fd44fc24 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -163,7 +163,7 @@ def strided_slice_shape_func(attrs, inputs, _): """ Shape func for strided_slice """ - slice_mode = convert(get_const_int(attrs.slice_mode)) + slice_mode = convert(0 if attrs.slice_mode == "end" else 1) # data independent if begin, end and strides exist if attrs.begin and attrs.end and attrs.strides: return [_strided_slice_shape_func_input_shape(inputs[0], attrs.begin, attrs.end, diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 38a17c41bd32..fab6e2c37454 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -611,7 +611,7 @@ def split(data, indices_or_sections, axis=0): return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size) -def strided_slice(data, begin, end, strides=None, slice_mode=False): +def strided_slice(data, begin, end, strides=None, slice_mode="end"): """Strided slice of an array. Parameters @@ -629,11 +629,12 @@ def strided_slice(data, begin, end, strides=None, slice_mode=False): Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. - slice_mode: boolean, optional - Specifies whether to enable slice mode. In slice mode, - strides will be ignored, end indicates the size of a slice - starting at the location specified by begin. If end[i] is -1, - all remaining elements in that dimension are included in the slice + slice_mode: str, optional + The slice mode [end, size]. + end: The ending indices for the slice [default]. + size: The input strides will be ignored, input end in this mode indicates + the size of a slice starting at the location specified by begin. If end[i] + is -1, all remaining elements in that dimension are included in the slice. 
Returns ------- diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index e78f89b1d91c..55f2fced3e07 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1689,7 +1689,7 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr if (param->begin && param->end && param->strides) { // stride will be set as 1 if slice mode is enabled std::vector stride_vec(num_axis, 1); - if (!param->slice_mode) { + if (param->slice_mode == "end") { for (size_t i = 0; i < param->strides.value().size(); ++i) { CHECK(param->strides.value()[i].defined()); stride_vec[i] = param->strides.value()[i]->value; @@ -1713,14 +1713,16 @@ bool StridedSliceRel(const Array& types, int num_inputs, const Attrs& attr // allow end to be None if (!param->end.value()[i].defined()) { end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range); - } else if (param->slice_mode) { + } else if (param->slice_mode == "size") { if (param->end.value()[i]->value < 0) { end_vec.push_back(max_range); } else { end_vec.push_back(begin_vec[i] + param->end.value()[i]->value); } - } else { + } else if (param->slice_mode == "end") { end_vec.push_back(param->end.value()[i]->value); + } else { + LOG(FATAL) << "Unsupported slice mode: " << param->slice_mode; } } for (int64_t i = end_vec.size(); i < num_axis; ++i) { @@ -1805,7 +1807,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, if (params->begin && params->end && params->strides) { for (Integer i : params->strides.value()) { CHECK(i.defined()); - strides.push_back(params->slice_mode ? 1 : i->value); + strides.push_back(params->slice_mode == "size" ? 1 : i->value); } for (Integer i : params->begin.value()) { @@ -1842,7 +1844,7 @@ Array> StridedSliceInferCorrectLayout(const Attrs& attrs, int64_t ed; if (!end[i].defined()) { ed = shape[i].as()->value; - } else if (params->slice_mode) { + } else if (params->slice_mode == "size") { if (end[i]->value < 0) { ed = shape[i].as()->value; } else { @@ -1918,7 +1920,7 @@ Array StridedSliceCompute(const Attrs& attrs, const Array(); const ConstantNode *cbegin, *cend, *cstrides; if ((cbegin = begin.as()) && (cend = end.as()) && @@ -1970,7 +1972,7 @@ Examples:: .add_argument("begin", "Tensor", "The indices to begin with in the slicing.") .add_argument("end", "Tensor", "Indices indicating end of the slice.") .add_argument("strides", "Tensor", "The stride values.") - .add_argument("slice_mode", "Tensor", "Whether to enable slice mode.") + .add_argument("slice_mode", "Tensor", "The slice mode.") .set_support_level(4) .set_attrs_type() .add_type_rel("StridedSlice", StridedSliceRel) @@ -2230,7 +2232,7 @@ Array SliceLikeCompute(const Attrs& attrs, const Array& } } return Array{topi::strided_slice(inputs[0], GetIntArray(begin_idx), - GetIntArray(end_idx), GetIntArray(strides), false)}; + GetIntArray(end_idx), GetIntArray(strides), "end")}; } TVM_REGISTER_GLOBAL("relay.op._make.slice_like").set_body_typed(MakeSliceLike); diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index d680de49b005..0bf9e7fd38a6 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -168,8 +168,8 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { for (const auto& branch : branches) { const CallNode* conv2d = branch[0]; int64_t channels = GetConv2DSuperChannelsDim(conv2d); - Array begin; - Array end; + std::vector begin; + std::vector end; for (size_t i = 0; i < channel_pos_; i++) 
{ begin.push_back(0); end.push_back(-1); @@ -177,27 +177,15 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { begin.push_back(index); index += channels; end.push_back(index); - DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - auto begin_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); - auto end_ndarray = runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); - auto strides_ndarray = - runtime::NDArray::Empty({int64_t(begin.size())}, DataType::Int(64), ctx); - - auto* begin_data = static_cast(begin_ndarray->data); - auto* end_data = static_cast(end_ndarray->data); - auto* strides_data = static_cast(strides_ndarray->data); - + std::vector strides(begin.size(), 1); for (size_t i = 0; i < begin.size(); ++i) { - begin_data[i] = begin[i]; - end_data[i] = end[i]; - end_data[i] -= begin_data[i]; - strides_data[i] = 1; + end[i] -= begin[i]; } - - auto slice = MakeStridedSlice(data, Constant(begin_ndarray), Constant(end_ndarray), - Constant(strides_ndarray), true); + std::vector ndarray_shape = {static_cast(begin.size())}; + Constant begin_const = MakeConstantTensor(DataType::Int(64), ndarray_shape, begin); + Constant end_const = MakeConstantTensor(DataType::Int(64), ndarray_shape, end); + Constant strides_const = MakeConstantTensor(DataType::Int(64), ndarray_shape, strides); + auto slice = MakeStridedSlice(data, begin_const, end_const, strides_const, "size"); subst_map->insert({GetRef(branch[depth]), slice}); } } diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index 2e73c632230b..7518eb9ac81a 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -673,7 +673,7 @@ Expr MakeConcatenate(Expr data, int axis); Expr MakeRepeat(Expr data, int repeats, int axis); -Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, bool slice_mode); +Expr MakeStridedSlice(Expr data, Expr begin, Expr end, Expr strides, String slice_mode); Expr MakeStack(Expr data, int axis); diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 8fd6aed70a75..8e535a692b88 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -644,12 +644,12 @@ def test_arange_with_dynamic_shape(): tvm.testing.assert_allclose(result.asnumpy(), np.array(range(10)).astype("int32")+1) def verify_any_strided_slice(data_shape, begin_shape, end_shape, strides_shape, - data_np_shape, slice_mode=False, const_attrs=False): + data_np_shape, slice_mode="end", const_attrs=False): # Generate random numpy input data np_data = np.random.uniform(size=data_np_shape).astype('float32') np_begin = np.random.randint(2, size=begin_shape, dtype="int32") np_end = np.random.randint(5, 10, size=end_shape, dtype="int32") - np_strides = np.random.randint(1, 2 if slice_mode else 3, size=strides_shape, dtype="int32") + np_strides = np.random.randint(1, 2 if slice_mode == "size" else 3, size=strides_shape, dtype="int32") # target numpy result ref_res = topi.testing.strided_slice_python(np_data, np_begin, np_end, np_strides, slice_mode) @@ -685,7 +685,7 @@ def test_any_strided_slice(): verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21)) verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (23, 29, 41)) verify_any_strided_slice(any_dims(4), (4,), (4,), (4,), (40, 50, 60, 70)) - verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21), slice_mode=True) + verify_any_strided_slice(any_dims(3), (3,), (3,), (3,), (15, 17, 21), 
slice_mode="size") verify_any_strided_slice(any_dims(2), (2,), (2,), (2,), (15, 21), const_attrs=True) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index dad1adebbf4e..74231cb0d5a1 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -296,7 +296,7 @@ def test_mean_var_std(): def test_strided_slice(): - def verify(dshape, begin, end, strides, output, slice_mode=False, + def verify(dshape, begin, end, strides, output, slice_mode="end", attr_const=True, test_ref=True, dtype="int32"): x = relay.var("x", relay.TensorType(dshape, "float32")) ndim = len(dshape) @@ -355,9 +355,9 @@ def verify(dshape, begin, end, strides, output, slice_mode=False, verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3)) verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], - (2, 4, 3), slice_mode=True, test_ref=False) + (2, 4, 3), slice_mode="size", test_ref=False) verify((3, 4, 3), [1, 0, 0], [-1, 2, 3], [1, 1, 1], - (2, 2, 3), slice_mode=True, test_ref=True) + (2, 2, 3), slice_mode="size", test_ref=True) def test_strided_set(): def verify(dshape, begin, end, strides, vshape, test_ref=True): diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index 429fd620e09e..68e7fece7e98 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -53,18 +53,18 @@ def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], 'int64'), - slice_mode=True) + slice_mode="size") y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], 'int64'), - slice_mode=True) + slice_mode="size") y3 = relay.nn.conv2d(x, w3) y4 = relay.strided_slice(y, begin=relay.const([0, channels1 + channels2], "int64"), end=relay.const([-1, channels4], "int64"), strides=relay.const([1, 1], 'int64'), - slice_mode=True) + slice_mode="size") y5 = relay.nn.max_pool2d(x) y = relay.Tuple((y1, y2, y3, y4, y5)) return relay.Function(args, y) @@ -113,12 +113,12 @@ def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], "int64"), - slice_mode=True) + slice_mode="size") y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], "int64"), - slice_mode=True) + slice_mode="size") y2 = relay.add(y2, bias) y = relay.Tuple((y1, y2)) return relay.Function(args, y) @@ -160,12 +160,12 @@ def expected(x, w1, w2, scale1, scale2, channels1, channels2): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels1], "int64"), strides=relay.const([1, 1], "int64"), - slice_mode=True) + slice_mode="size") y2 = relay.strided_slice(y, begin=relay.const([0, channels1], "int64"), end=relay.const([-1, channels2], "int64"), strides=relay.const([1, 1], "int64"), - slice_mode=True) + slice_mode="size") y1 = relay.multiply(y1, scale1) y2 = relay.multiply(y2, scale2) y = relay.Tuple((y1, y2)) @@ -208,12 +208,12 @@ def expected(x, w, channels, repeat): begin=relay.const([0, 0], "int64"), end=relay.const([-1, channels], "int64"), strides=relay.const([1, 1], "int64"), - slice_mode=True) + 
slice_mode="size")
         y2 = relay.strided_slice(y,
                                  begin=relay.const([0, channels], "int64"),
                                  end=relay.const([-1, channels], "int64"),
                                  strides=relay.const([1, 1], "int64"),
-                                 slice_mode=True)
+                                 slice_mode="size")
         y = relay.concatenate((y1, y2), axis=1)
         return relay.Function(args, y)

diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index cacde55eb703..d46caff39e9a 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -520,15 +520,15 @@ inline Array split(const Tensor& x, Array split_indices, int ax
  * \param begin The indices to begin with in the slicing
  * \param end Indicies indicating end of the slice
  * \param strides Specifies the stride values, it can be negative
- * \param slice_mode Specifies whether to enable slice mode
  * in that case, the input tensor will be reversed in that particular axis
+ * \param slice_mode Specifies the slice mode
  * \param name The name of the operation
  * \param tag The tag to mark the operation
  *
  * \return A Tensor whose op member is the split operation
  */
 inline Tensor strided_slice(const Tensor& x, const Array& begin, const Array& end,
-                            const Array& strides, const bool& slice_mode,
+                            const Array& strides, std::string slice_mode = "end",
                             std::string name = "T_strided_slice", std::string tag = kInjective) {
   size_t src_tensor_dim = static_cast(x->shape.size());
   // Setup the ranges.
@@ -561,7 +561,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, const
 
     if (!end[i].defined()) {
       end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
-    } else if (slice_mode) {
+    } else if (slice_mode == "size") {
       if (end[i]->value < 0) {
         end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
       } else {
diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py
index 8d68fc1c1492..970e1dedd8c9 100644
--- a/topi/python/topi/testing/strided_slice_python.py
+++ b/topi/python/topi/testing/strided_slice_python.py
@@ -17,7 +17,7 @@
 """strided_slice/set in python"""
 
 
-def strided_slice_python(data, begin, end, strides, slice_mode=False):
+def strided_slice_python(data, begin, end, strides, slice_mode="end"):
     """Python version of strided slice operator.
 
     Parameters
@@ -34,11 +34,13 @@ def strided_slice_python(data, begin, end, strides, slice_mode="end"):
     strides : list
         The stride of each slice.
 
-    slice_mode : boolean
-        Specifies whether to enable slice mode.
-        In slice mode, strides will be ignored,
-        end indicates the size of a slice starting
-        at the location specified by begin.
+    slice_mode : str, optional
+        The slice mode [end, size].
+        end: The default slice mode, ending indices for the slice.
+        size: The input strides will be ignored, input end in this mode indicates
+        the size of a slice starting at the location specified by begin. If end[i] is -1,
+        all remaining elements in that dimension are included in the slice.
+
 
     Returns
     -------
@@ -49,13 +51,13 @@ def strided_slice_python(data, begin, end, strides, slice_mode="end"):
     strides = [] if strides is None else strides
     slices = []
     for i in range(len(data.shape)):
         new_stride = None
-        if not slice_mode and i < len(strides):
+        if slice_mode == "end" and i < len(strides):
             new_stride = strides[i]
         new_begin = begin[i] if i < len(begin) else None
         if i >= len(end):
             new_end = None
-        elif slice_mode:
+        elif slice_mode == "size":
             if end[i] < 0:
                 new_end = None
             else:
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index a5a564b106f1..e1984458d677 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -131,7 +131,7 @@ def flip(a, axis=0):
     """
     return cpp.flip(a, axis)
 
-def strided_slice(a, begin, end, strides=None, slice_mode=False):
+def strided_slice(a, begin, end, strides=None, slice_mode="end"):
     """Slice of an array.
 
     Parameters
@@ -139,22 +139,23 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"):
     a : tvm.te.Tensor
         The tensor to be sliced.
 
-    begin: list of int
+    begin : list of int
         The indices to begin with in the slicing.
 
-    end: list of int
+    end : list of int
         Indicies indicating end of the slice.
 
-    strides: list of int, optional
+    strides : list of int, optional
         Specifies the stride values, it can be negative
         in that case, the input tensor will be reversed
         in that particular axis.
 
-    slice_mode: boolean, optional
-        Specifies whether to enable slice mode. In slice mode,
-        strides will be ignored, end indicates the size of a slice
-        starting at the location specified by begin. If end[i] is -1,
-        all remaining elements in that dimension are included in the slice
+    slice_mode : str, optional
+        The slice mode [end, size].
+        end - The default slice mode, ending indices for the slice.
+        size - The input strides will be ignored, input end in this mode indicates
+        the size of a slice starting at the location specified by begin. If end[i]
+        is -1, all remaining elements in that dimension are included in the slice.
 
     Returns
     -------
From 5e32d734235fe489f90cd96452e325fe1fc5a8fc Mon Sep 17 00:00:00 2001
From: Yong Wu
Date: Sun, 7 Jun 2020 06:53:43 +0800
Subject: [PATCH 21/22] fix CI

---
 python/tvm/relay/op/transform.py | 12 ++++++------
 topi/python/topi/transform.py    |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index fab6e2c37454..6b93bbf11114 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -619,22 +619,22 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"):
     data : relay.Expr
         The source array to be sliced.
 
-    begin: relay.Expr, Tuple[int], or List[int]
+    begin : relay.Expr, Tuple[int], or List[int]
         The indices to begin with in the slicing.
 
-    end: relay.Expr, Tuple[int], or List[int]
+    end : relay.Expr, Tuple[int], or List[int]
         Indices indicating end of the slice.
 
-    strides: relay.Expr, Tuple[int], or List[int], optional
+    strides : relay.Expr, Tuple[int], or List[int], optional
         Specifies the stride values, it can be negative in that case,
         the input tensor will be reversed in that particular axis.
 
-    slice_mode: str, optional
+    slice_mode : str, optional
         The slice mode [end, size].
         end: The ending indices for the slice [default].
         size: The input strides will be ignored, input end in this mode indicates
-            the size of a slice starting at the location specified by begin. If end[i]
-            is -1, all remaining elements in that dimension are included in the slice.
+        the size of a slice starting at the location specified by begin. If end[i]
+        is -1, all remaining elements in that dimension are included in the slice.
 
     Returns
     -------
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index e1984458d677..f9a7c7796b67 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -152,10 +152,10 @@ def strided_slice(a, begin, end, strides=None, slice_mode="end"):
 
     slice_mode : str, optional
         The slice mode [end, size].
-        end - The default slice mode, ending indices for the slice.
+        end - The ending indices for the slice [default].
         size - The input strides will be ignored, input end in this mode indicates
-            the size of a slice starting at the location specified by begin. If end[i]
-            is -1, all remaining elements in that dimension are included in the slice.
+        the size of a slice starting at the location specified by begin. If end[i]
+        is -1, all remaining elements in that dimension are included in the slice.
 
     Returns
     -------
From 5b5dda5389171c934b94b6b4ef664caf74488b8e Mon Sep 17 00:00:00 2001
From: Yong Wu
Date: Tue, 9 Jun 2020 10:35:53 +0800
Subject: [PATCH 22/22] update docstring

---
 python/tvm/relay/op/vision/nms.py | 12 +++++++++---
 topi/python/topi/cuda/nms.py      | 10 ++++++++--
 topi/python/topi/vision/nms.py    | 12 +++++++++---
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py
index 38dcbe5452be..b60b49ab0ccd 100644
--- a/python/tvm/relay/op/vision/nms.py
+++ b/python/tvm/relay/op/vision/nms.py
@@ -77,13 +77,19 @@ def non_max_suppression(data,
         or [batch_size, num_anchors, 5]. The last dimension should be in format of
         [class_id, score, box_left, box_top, box_right, box_bottom]
-        or [score, box_left, box_top, box_right, box_bottom].
+        or [score, box_left, box_top, box_right, box_bottom]. It could
+        be the second output out_tensor of get_valid_counts.
 
     valid_count : relay.Expr
-        1-D tensor for valid number of boxes.
+        1-D tensor for valid number of boxes. It could be the output
+        valid_count of get_valid_counts.
 
     indices: relay.Expr
-        2-D tensor with shape [batch_size, num_anchors]
+        2-D tensor with shape [batch_size, num_anchors], represents
+        the index of box in original data. It could be the third
+        output out_indices of get_valid_counts. The values in the
+        second dimension are like the output of arange(num_anchors)
+        if get_valid_counts is not used before non_max_suppression.
 
     max_output_size : int, optional
         Max number of output valid boxes for each instance.
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index c72cdad0454c..f2c1143b5fb8 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -354,12 +354,18 @@ def non_max_suppression(data, valid_count, indices, max_output_size=-1,
         3-D tensor with shape [batch_size, num_anchors, elem_length].
         The last dimension should be in format of
         [class_id, score, box_left, box_top, box_right, box_bottom].
+        It could be the second output out_tensor of get_valid_counts.
 
     valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
+        1-D tensor for valid number of boxes. It could be the output
+        valid_count of get_valid_counts.
 
     indices : tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors].
+        2-D tensor with shape [batch_size, num_anchors], represents
+        the index of box in original data. It could be the third
+        output out_indices of get_valid_counts. The values in the
+        second dimension are like the output of arange(num_anchors)
+        if get_valid_counts is not used before non_max_suppression.
 
     max_output_size : optional, int
         Max number of output valid boxes for each instance.
diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py
index 1a2089683b62..269c876d647e 100644
--- a/topi/python/topi/vision/nms.py
+++ b/topi/python/topi/vision/nms.py
@@ -235,17 +235,23 @@ def hybrid_nms(data, sorted_index, valid_count, indices, batch_size, num_anchors
     ----------
     data: tvm.te.Tensor or numpy NDArray
         Bounding boxes with class and score. 3-D tensor with shape
-        [batch_size, num_anchors, 6].
+        [batch_size, num_anchors, 6]. It could be the second output
+        out_tensor of get_valid_counts.
 
     sorted_index : tvm.te.Tensor or numpy NDArray
         Bounding box indexes sorted by score, with shape
         [batch_size, num_anchors].
 
     valid_count : tvm.te.Tensor or numpy NDArray
-        1-D tensor for valid number of boxes.
+        1-D tensor for valid number of boxes. It could be the output
+        valid_count of get_valid_counts.
 
     indices : tvm.te.Tensor or numpy.NDArray
-        indices in original tensor, with shape [batch_size, num_anchors]
+        indices in original tensor, with shape [batch_size, num_anchors],
+        represents the index of box in original data. It could be the third
+        output out_indices of get_valid_counts. The values in the second
+        dimension are like the output of arange(num_anchors) if get_valid_counts
+        is not used before non_max_suppression.
 
     batch_size: tvm.tir.IntImm or tvm.tir.Var
        Batch size. We need to pass it in since hybrid script doesn't support
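
The slice_mode semantics that patches 18-22 converge on are easier to see outside the diffs. The following NumPy-only sketch mirrors the behavior of the patched topi.testing.strided_slice_python helper shown above; the helper name strided_slice_ref and the sample shapes are illustrative only and not part of the patch series.

import numpy as np

def strided_slice_ref(data, begin, end, strides=None, slice_mode="end"):
    # "end" mode: end holds ending indices and strides are honored.
    # "size" mode: strides are ignored (treated as 1) and end[i] is the
    # length of the slice starting at begin[i]; -1 means "rest of the axis".
    strides = [] if strides is None else strides
    slices = []
    for i in range(len(data.shape)):
        new_stride = strides[i] if slice_mode == "end" and i < len(strides) else None
        new_begin = begin[i] if i < len(begin) else None
        if i >= len(end):
            new_end = None
        elif slice_mode == "size":
            new_end = None if end[i] < 0 else new_begin + end[i]
        else:
            new_end = end[i]
        slices.append(slice(new_begin, new_end, new_stride))
    return data[tuple(slices)]

x = np.arange(36).reshape(3, 4, 3)
# "end" mode: begin/end/strides behave like ordinary Python slicing.
print(strided_slice_ref(x, [1, 0, 0], [3, 4, 3], [1, 2, 1]).shape)  # (2, 2, 3)
# "size" mode: mirrors verify((3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1],
# (2, 4, 3), slice_mode="size") from test_op_level4 above.
print(strided_slice_ref(x, [1, 0, 0], [3, -1, 3], slice_mode="size").shape)  # (2, 4, 3)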
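
Patch 22's docstrings also pin down how the three outputs of get_valid_counts line up with the data, valid_count, and indices arguments of non_max_suppression. Below is a minimal Relay sketch of that wiring under the post-patch API; the input shape, layout, thresholds, and variable names are illustrative assumptions, not code from the series.

import tvm
from tvm import relay

# One batch of 2500 anchors in [class_id, score, x1, y1, x2, y2] layout.
boxes = relay.var("boxes", shape=(1, 2500, 6), dtype="float32")

# valid[0] = valid_count, valid[1] = out_tensor, valid[2] = out_indices,
# matching the docstrings updated in patch 22.
valid = relay.vision.get_valid_counts(boxes, score_threshold=0.0,
                                      id_index=0, score_index=1)

# return_indices=False returns the surviving boxes directly.
nms = relay.vision.non_max_suppression(valid[1], valid[0], valid[2],
                                       max_output_size=-1, iou_threshold=0.5,
                                       force_suppress=False, top_k=-1,
                                       return_indices=False)

mod = tvm.IRModule.from_expr(relay.Function([boxes], nms))
print(mod)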