From 45807de1e1127e4fd716e75f6eb8963e8371c72a Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Wed, 9 Jan 2019 14:33:14 -0800 Subject: [PATCH 01/21] on the way to enable hybrid schedule --- 3rdparty/HalideIR | 2 +- include/tvm/operation.h | 3 + python/tvm/tensor.py | 7 +- src/op/hybrid_op.cc | 72 +++++++++++++++++++++ src/op/hybrid_op.h | 36 +++++++++++ src/op/op_util.cc | 32 --------- src/op/op_util.h | 8 --- tests/python/unittest/test_hybrid_script.py | 22 +++++++ 8 files changed, 139 insertions(+), 43 deletions(-) create mode 100644 src/op/hybrid_op.h diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR index 6e7c1f046fda..a08e26e5a97f 160000 --- a/3rdparty/HalideIR +++ b/3rdparty/HalideIR @@ -1 +1 @@ -Subproject commit 6e7c1f046fda536562dc80977e93324fee2324bd +Subproject commit a08e26e5a97f4ef4d566a42f6c78704b3f9c7b8a diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 02cd0d016f39..3509b133cfc3 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -459,6 +459,8 @@ class HybridOpNode : public OperationNode { Array inputs; /*! \brief Symbolic placeholder representation of outputs */ Array outputs; + /*! \brief The axis of iterations */ + Array axis; /*! \brief the statement that generates the computation. This is * slightly different from the body in ExternOpNode. All the output * tensors keep its own name specified by users in the script. @@ -500,6 +502,7 @@ class HybridOpNode : public OperationNode { v->Visit("attrs", &attrs); v->Visit("inputs", &inputs); v->Visit("outputs", &outputs); + v->Visit("axis", &axis); v->Visit("body", &body); } EXPORT static Operation make(std::string name, diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index 9a98e9a6e769..e1345ad373bf 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -152,7 +152,7 @@ class ComputeOp(Operation): """Compute operation.""" @property def axis(self): - """Represent axis of IterVar, only defined when it is a ComputeOp""" + """Represent axis of IterVar, defined when it is a ComputeOp""" return self.__getattr__("axis") @property @@ -184,4 +184,7 @@ class ExternOp(Operation): @register_node class HybridOp(Operation): """Hybrid operation.""" - pass + @property + def axis(self): + """Represent axis of IterVar, also defined when it is a HybridOp""" + return self.__getattr__("axis") diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 4dbb2c0b964f..13283966db4b 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -8,7 +8,9 @@ #include #include #include + #include "op_util.h" +#include "hybrid_op.h" namespace tvm { using namespace ir; @@ -52,6 +54,7 @@ Operation HybridOpNode::make(std::string name, n->attrs = std::move(attrs); n->inputs = std::move(inputs); n->outputs = std::move(outputs); + n->axis = op::GatherLoopVars(body); n->body = std::move(body); Operation res = Operation(n); return res; @@ -186,4 +189,73 @@ Stmt HybridOpNode::BuildProvide( ret = op::ReplaceProvideTensor(ret, rmap); return ret; } + +class LoopVarFinder : public ir::IRVisitor { + public: + std::vector res_; + + void Visit_(const ir::For *op) { + Var loop_var(op->loop_var); + Range dom = Range::make_by_min_extent(op->min, op->extent); + IterVarType iter_var_ty = kOpaque; + switch(op->for_type) { + case ForType::Serial: + iter_var_ty = kOrdered; + break; + case ForType::Parallel: + iter_var_ty = kDataPar; + break; + case ForType::Vectorized: + iter_var_ty = kVectorized; + break; + case ForType::Unrolled: + iter_var_ty = kUnrolled; + break; + } + res_.push_back(IterVarNode::make(dom, loop_var, 
iter_var_ty)); + Visit(op->body); + } + +}; + +std::vector GatherLoopVars(Stmt stmt) { + LoopVarFinder Finder; + Finder.Visit(stmt); + return Finder.res_; +} + +// replacer to replace tensors' usage in Provide +class ProviderReplacer : public ir::IRMutator { + public: + explicit ProviderReplacer(const std::unordered_map& vmap) + : vmap_(vmap) {} + + Stmt Mutate_(const ir::Provide* op, const Stmt& s) { + Tensor t = Operation(op->func.node_).output(op->value_index); + auto it = vmap_.find(t); + if (it != vmap_.end()) { + Stmt ret = ir::Provide::make( + it->second->op, it->second->value_index, op->value, op->args); + found = true; + return IRMutator::Mutate_(ret.as(), ret); + } + return IRMutator::Mutate_(op, s); + } + + // whether it is found. + bool found{false}; + + private: + const std::unordered_map& vmap_; +}; + +Stmt ReplaceProvideTensor(Stmt stmt, + const std::unordered_map& replace) { + ProviderReplacer repl(replace); + Stmt ret = repl.Mutate(stmt); + return repl.found ? ret : stmt; +} + + + } // namespace tvm diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h new file mode 100644 index 000000000000..a57d871f813d --- /dev/null +++ b/src/op/hybrid_op.h @@ -0,0 +1,36 @@ +#ifndef TVM_HYBRID_OP_H +#define TVM_HYBRID_OP_H + +#include +#include +#include +#include +#include +#include +#include +#include "../pass/ir_util.h" +#include "../pass/arg_binder.h" +#include "../schedule/message_passing.h" + + +namespace tvm { +namespace op { + +/*! + * \brief Find all the iteration variables in the given statement body. + * \param stmt The body to be inspected. + */ +std::vector GatherLoopVars(Stmt stmt); + +/*! + * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map. + * \param stmt The statement to be processed. + * \param replace The replacement rule. + */ +Stmt ReplaceProvideTensor(Stmt stmt, + const std::unordered_map& replace); + +} // namespace op +} // namespace tvm + +#endif // TVM_HYBRID_OP_H diff --git a/src/op/op_util.cc b/src/op/op_util.cc index 886f7c912303..9038e045106c 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -164,38 +164,6 @@ std::vector MakeIfNest(const std::vector& predicates) { return nest; } -// replacer to replace tensors' usage in Provide -class ProviderReplacer : public ir::IRMutator { - public: - explicit ProviderReplacer(const std::unordered_map& vmap) - : vmap_(vmap) {} - - Stmt Mutate_(const ir::Provide* op, const Stmt& s) { - Tensor t = Operation(op->func.node_).output(op->value_index); - auto it = vmap_.find(t); - if (it != vmap_.end()) { - Stmt ret = ir::Provide::make( - it->second->op, it->second->value_index, op->value, op->args); - found = true; - return IRMutator::Mutate_(ret.as(), ret); - } - return IRMutator::Mutate_(op, s); - } - - // whether it is found. - bool found{false}; - - private: - const std::unordered_map& vmap_; -}; - -Stmt ReplaceProvideTensor(Stmt stmt, - const std::unordered_map& replace) { - ProviderReplacer repl(replace); - Stmt ret = repl.Mutate(stmt); - return repl.found ? ret : stmt; -} - // replacer to replace tensors class TensorReplacer : public ir::IRMutator { public: diff --git a/src/op/op_util.h b/src/op/op_util.h index 6971f14eef73..523633db0984 100644 --- a/src/op/op_util.h +++ b/src/op/op_util.h @@ -48,14 +48,6 @@ MakeLoopNest(const Stage& stage, */ std::vector MakeIfNest(const std::vector& predicates); -/*! - * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map. - * \param stmt The statement to be processed. 
- * \param replace The replacement rule. - */ -Stmt ReplaceProvideTensor(Stmt stmt, - const std::unordered_map& replace); - /*! * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map. * \param stmt The statement to be processed. diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 668b1598446b..0d6edf27561c 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -593,6 +593,27 @@ def hoo(a, b): b = [1, 2, 3, 4, 5] run_and_check(hoo, [a, b]) +def test_schedule(): + @script + def outer_product(a, b): + """This is a simple outer product. + Actually this function is not required to be documented. + I write this docstring to test skipping docstring functionality. + """ + c = output_tensor((128, 128), a.dtype) + for i in range(128): + for j in range(128): + c[i, j] = a[i] * b[j] + return c + a = tvm.placeholder((128,)) + b = tvm.placeholder((128,)) + c = outer_product(a, b) + sch = tvm.create_schedule(c.op) + i, j = c.op.axis + print(i.iter_type) + print(j.iter_type) + print(i, j) + io, ii = sch[c].split(i, 4) if __name__ == "__main__": test_outer_product() @@ -610,5 +631,6 @@ def hoo(a, b): test_func_call() test_bool() test_const_range() + test_schedule() # TODO: # test_inplace() From 52e757e4a6ef7fbc227b79691bf1dce6ab731534 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 11:03:44 -0800 Subject: [PATCH 02/21] I think I am done with imperfect loop split? --- src/op/compute_op.cc | 1 + src/op/hybrid_op.cc | 135 +++++++++++++++----- src/op/hybrid_op.h | 10 +- src/op/op_util.cc | 28 ++++ src/op/op_util.h | 12 ++ tests/python/unittest/test_hybrid_script.py | 37 ++++-- 6 files changed, 180 insertions(+), 43 deletions(-) diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index d4cb2b4c632b..baf42f9367b4 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -212,6 +212,7 @@ void ComputeOpNode::GatherBound( const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { + CHECK_EQ(self.operator->(), this); const TensorDom& tdom = tensor_dom.at(self.output(0)); for (size_t i = 0; i < this->axis.size(); ++i) { Range r = arith::Union(tdom.data.at(i)).cover_range(this->axis[i]->dom); diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 13283966db4b..bfdd7706ea82 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include "op_util.h" @@ -27,7 +29,7 @@ int HybridOpNode::num_outputs() const { } Array HybridOpNode::root_iter_vars() const { - return {}; + return this->axis; } Type HybridOpNode::output_dtype(size_t i) const { @@ -105,6 +107,10 @@ void HybridOpNode::GatherBound( const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { + for (auto iter_var : axis) { + CHECK(!out_dom_map->count(iter_var)); + out_dom_map->operator[](iter_var) = iter_var->dom; + } } Stmt HybridOpNode::BuildRealize( @@ -187,41 +193,113 @@ Stmt HybridOpNode::BuildProvide( * */ ret = op::ReplaceTensor(ret, rmap); ret = op::ReplaceProvideTensor(ret, rmap); + + ret = op::ApplySchedule(stage, dom_map, ret); return ret; } -class LoopVarFinder : public ir::IRVisitor { - public: - std::vector res_; +namespace op { +Stmt ApplySplits(const Stage &stage, + const std::unordered_map& dom_map, Stmt stmt) { + class LoopSpliter : public IRMutator { + Expr factor; + IterVar parent, inner, outer; + public: + 
LoopSpliter(const SplitNode *split, + const std::unordered_map& dom_map) : + factor(split->factor) { + + auto &parent_ = split->parent; + if (parent_->dom.defined()) { + CHECK(is_const_int(parent_->dom->min, 0)); + parent= parent_; + } else { + CHECK(dom_map.count(parent_)); + auto &dom = dom_map.find(parent_)->second; + CHECK(is_const_int(dom->min, 0)); + parent = IterVarNode::make(dom, parent_->var, parent_->iter_type); + } + + auto &inner_ = split->inner; + CHECK(dom_map.count(inner_)); + auto &inner_dom = dom_map.find(inner_)->second; + CHECK(is_const_int(inner_dom->min, 0)); + + auto &outer_ = split->outer; + CHECK(dom_map.count(outer_)); + auto &outer_dom = dom_map.find(outer_)->second; + CHECK(is_const_int(outer_dom->min, 0)); - void Visit_(const ir::For *op) { - Var loop_var(op->loop_var); - Range dom = Range::make_by_min_extent(op->min, op->extent); - IterVarType iter_var_ty = kOpaque; - switch(op->for_type) { - case ForType::Serial: - iter_var_ty = kOrdered; - break; - case ForType::Parallel: - iter_var_ty = kDataPar; - break; - case ForType::Vectorized: - iter_var_ty = kVectorized; - break; - case ForType::Unrolled: - iter_var_ty = kUnrolled; - break; + inner = IterVarNode::make(inner_dom, inner_->var, inner_->iter_type); + outer = IterVarNode::make(outer_dom, outer_->var, outer_->iter_type); + } + + Stmt Mutate_(const For *op, const Stmt &stmt) { + if (op->loop_var.get() == parent->var.get()) { + std::unordered_map rmap; + rmap[op->loop_var.get()] = inner + outer * factor; + Stmt ret = ir::Substitute(op->body, rmap); + Expr cond = likely(outer * factor < (parent->dom->extent - inner)); + ret = IfThenElse::make(cond, ret); + ret = For::make(inner->var, Expr(0), inner->dom->extent, + IterVarTypeToForType(inner->iter_type), op->device_api, ret); + ret = For::make(outer->var, Expr(0), outer->dom->extent, + IterVarTypeToForType(outer->iter_type), op->device_api, ret); + return ret; + } + return IRMutator::Mutate_(op, stmt); + } + }; + + bool changed = true; + while (changed) { + changed = false; + for (auto &rel : stage->relations) { + if (const SplitNode* split = rel.as()) { + bool not_splited = false; + PostOrderVisit(stmt, [¬_splited, &split](const NodeRef &node) { + if (const Variable *var = node.as()) { + if (var == split->parent->var.get()) + not_splited = true; + } + }); + if (not_splited) { + stmt = LoopSpliter(split, dom_map).Mutate(stmt); + changed = true; + } + } } - res_.push_back(IterVarNode::make(dom, loop_var, iter_var_ty)); - Visit(op->body); } -}; + return stmt; +} + +Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { + return stmt; +} + +Stmt ApplyLoopOrder(const Stage &stage, Stmt stmt) { + return stmt; +} + +Stmt ApplySchedule(const Stage &stage, const + std::unordered_map& dom_map, Stmt stmt) { + stmt = ApplySplits(stage, dom_map, stmt); + stmt = ApplyLoopAnnotations(stage, stmt); + stmt = ApplyLoopOrder(stage, stmt); + return stmt; +} std::vector GatherLoopVars(Stmt stmt) { - LoopVarFinder Finder; - Finder.Visit(stmt); - return Finder.res_; + std::vector res_; + PostOrderVisit(stmt, [&res_](const NodeRef &node) { + if (const For *op = node.as()) { + Var loop_var(op->loop_var); + Range dom = Range::make_by_min_extent(op->min, op->extent); + res_.push_back(IterVarNode::make(dom, loop_var, ForTypeToIterVarType(op->for_type))); + } + }); + return res_; } // replacer to replace tensors' usage in Provide @@ -255,7 +333,6 @@ Stmt ReplaceProvideTensor(Stmt stmt, Stmt ret = repl.Mutate(stmt); return repl.found ? 
ret : stmt; } - - +} // namespace op } // namespace tvm diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index a57d871f813d..5f8a7b5b578c 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -28,7 +28,15 @@ std::vector GatherLoopVars(Stmt stmt); * \param replace The replacement rule. */ Stmt ReplaceProvideTensor(Stmt stmt, - const std::unordered_map& replace); + const std::unordered_map& replace); + +/*! + * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map. + * \param stmt The statement to be processed. + * \param stage The schedule information to be applied. + */ +Stmt ApplySchedule(const Stage& stage, + const std::unordered_map& dom_map, Stmt stmt); } // namespace op } // namespace tvm diff --git a/src/op/op_util.cc b/src/op/op_util.cc index 9038e045106c..ef0da51097a1 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -215,5 +215,33 @@ Stmt Substitute(Stmt s, return ir::Substitute(s, init); } +IterVarType ForTypeToIterVarType(ir::ForType for_type) { + switch(for_type) { + case ForType::Serial: + return kOrdered; + case ForType::Parallel: + return kDataPar; + case ForType::Vectorized: + return kVectorized; + case ForType::Unrolled: + return kUnrolled; + } +} + +ir::ForType IterVarTypeToForType(IterVarType iter_type) { + switch(iter_type) { + case kOrdered: + return ForType::Serial; + case kDataPar: + return ForType::Parallel; + case kVectorized: + return ForType::Vectorized; + case kUnrolled: + return ForType::Unrolled; + default: + return ForType::Serial; + } +} + } // namespace op } // namespace tvm diff --git a/src/op/op_util.h b/src/op/op_util.h index 523633db0984..de2e44c2ed59 100644 --- a/src/op/op_util.h +++ b/src/op/op_util.h @@ -72,6 +72,18 @@ Expr ReplaceTensor(Expr expr, Stmt Substitute(Stmt stmt, const std::unordered_map& value_map); +/*! + * \brief Converts Halide ForType to its corresponding IterVarType + * \param for_type The ForType to be converted + */ +IterVarType ForTypeToIterVarType(ir::ForType for_type); + +/*! + * \brief Converts IterVarType to its corresponding Halide ForType + * \param iter_type The IterVarType to be converted + */ +ir::ForType IterVarTypeToForType(IterVarType iter_type); + } // namespace op } // namespace tvm #endif // TVM_OP_OP_UTIL_H_ diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 0d6edf27561c..080618c7df12 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -594,26 +594,37 @@ def hoo(a, b): run_and_check(hoo, [a, b]) def test_schedule(): + # Testing perfect loop split @script def outer_product(a, b): - """This is a simple outer product. - Actually this function is not required to be documented. - I write this docstring to test skipping docstring functionality. 
- """ - c = output_tensor((128, 128), a.dtype) + c = output_tensor((128, 64), a.dtype) for i in range(128): - for j in range(128): + for j in range(64): c[i, j] = a[i] * b[j] return c - a = tvm.placeholder((128,)) - b = tvm.placeholder((128,)) + a = tvm.placeholder((128,), name='a') + b = tvm.placeholder((64,), name='b') c = outer_product(a, b) sch = tvm.create_schedule(c.op) - i, j = c.op.axis - print(i.iter_type) - print(j.iter_type) - print(i, j) - io, ii = sch[c].split(i, 4) + j, i = c.op.axis + jo, ji = sch[c].split(j, 4) + joo, joi = sch[c].split(jo, 4) + ir = tvm.lower(sch, [a, b, c], simple_mode=True) + assert isinstance(ir, tvm.stmt.ProducerConsumer) + ir = ir.body + assert isinstance(ir, tvm.stmt.AttrStmt) + ir = ir.body + assert ir.loop_var.name == 'i' + ir = ir.body + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'j.outer.outer' + ir = ir.body + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'j.outer.inner' + ir = ir.body + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'j.inner' + if __name__ == "__main__": test_outer_product() From 11e97e56b1e5d9099fe4257a949cd8149e9689b7 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 11:11:47 -0800 Subject: [PATCH 03/21] copyright watermark --- src/op/hybrid_op.cc | 4 ++-- src/op/hybrid_op.h | 5 +++++ tests/python/unittest/test_hybrid_script.py | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index bfdd7706ea82..4eaec71e0c90 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \brief Hybrid computation rule. * \file hybrid_op.cc */ @@ -199,6 +199,7 @@ Stmt HybridOpNode::BuildProvide( } namespace op { + Stmt ApplySplits(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt) { class LoopSpliter : public IRMutator { @@ -334,5 +335,4 @@ Stmt ReplaceProvideTensor(Stmt stmt, return repl.found ? ret : stmt; } } // namespace op - } // namespace tvm diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index 5f8a7b5b578c..6046ef04ec7b 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -1,3 +1,8 @@ +/*! + * Copyright (c) 2019 by Contributors + * \brief Helper utilities to implement hybrid_op. 
+ * \file hybrid_op.h + */ #ifndef TVM_HYBRID_OP_H #define TVM_HYBRID_OP_H diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 080618c7df12..7de03ff1b906 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -594,7 +594,7 @@ def hoo(a, b): run_and_check(hoo, [a, b]) def test_schedule(): - # Testing perfect loop split + # Test perfect loop split @script def outer_product(a, b): c = output_tensor((128, 64), a.dtype) @@ -625,6 +625,11 @@ def outer_product(a, b): assert isinstance(ir, tvm.stmt.For) assert ir.loop_var.name == 'j.inner' + # Test imperfect loop split + # Test loop annotation + # Test loop binds + # Test loop reorder + if __name__ == "__main__": test_outer_product() From aa46637dee64fe95cf39fc99041e5534226eb134 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 14:24:21 -0800 Subject: [PATCH 04/21] loop annotation --- src/op/hybrid_op.cc | 40 +++++++++++++++++++++ src/op/hybrid_op.h | 11 +++++- src/op/op_util.cc | 8 ++--- src/schedule/schedule_dataflow_rewrite.cc | 3 ++ tests/python/unittest/test_hybrid_script.py | 6 ++-- 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 4eaec71e0c90..3e0e5e96df3d 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -276,6 +276,46 @@ Stmt ApplySplits(const Stage &stage, } Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { + class LoopAnnotator : public IRMutator { + const Variable *var; + ForType for_type; + public: + LoopAnnotator(const Variable *var_, ForType for_type_) : var(var_), for_type(for_type_) {} + + Stmt Mutate_(const For *op, const Stmt &stmt) { + if (op->loop_var.get() == var) { + CHECK(for_type != op->for_type); + return For::make(op->loop_var, op->min, op->extent, + for_type, op->device_api, op->body); + } + return IRMutator::Mutate_(op, stmt); + } + }; + + for (auto &iter_var : stage->leaf_iter_vars) { + bool equal = false; + int found = 0; + + const Variable *var = iter_var->var.get(); + ForType expected = IterVarTypeToForType(iter_var->iter_type); + if (stage->iter_var_attrs.count(iter_var)) { + expected = IterVarTypeToForType(stage->iter_var_attrs[iter_var]->iter_type); + } + + PostOrderVisit(stmt, [&found, &var, &expected, &equal](const NodeRef &node) { + if (const For *op = node.as()) { + if (op->loop_var.get() == var) { + ++found; + equal = expected == op->for_type; + } + } + }); + + CHECK_EQ(found, 1) << " iter var should be found exactly once!"; + if (!equal) { + stmt = LoopAnnotator(var, expected).Mutate(stmt); + } + } return stmt; } diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index 6046ef04ec7b..29cc774341cd 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -36,13 +36,22 @@ Stmt ReplaceProvideTensor(Stmt stmt, const std::unordered_map& replace); /*! - * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map. + * \brief Apply the schedule manipulation on the function body. * \param stmt The statement to be processed. * \param stage The schedule information to be applied. */ Stmt ApplySchedule(const Stage& stage, const std::unordered_map& dom_map, Stmt stmt); +/*! + * \brief Apply loop splits in the schedule on the function body. + * \param stage The schedule information to be applied. + * \param dom_map The extents of the iterative variables may be used. + * \param stmt The statement to be processed. 
+ */ +Stmt ApplySplits(const Stage &stage, + const std::unordered_map& dom_map, Stmt stmt); + } // namespace op } // namespace tvm diff --git a/src/op/op_util.cc b/src/op/op_util.cc index ef0da51097a1..e6fadd3328a2 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -218,9 +218,9 @@ Stmt Substitute(Stmt s, IterVarType ForTypeToIterVarType(ir::ForType for_type) { switch(for_type) { case ForType::Serial: - return kOrdered; - case ForType::Parallel: return kDataPar; + case ForType::Parallel: + return kParallelized; case ForType::Vectorized: return kVectorized; case ForType::Unrolled: @@ -230,9 +230,9 @@ IterVarType ForTypeToIterVarType(ir::ForType for_type) { ir::ForType IterVarTypeToForType(IterVarType iter_type) { switch(iter_type) { - case kOrdered: - return ForType::Serial; case kDataPar: + return ForType::Serial; + case kParallelized: return ForType::Parallel; case kVectorized: return ForType::Vectorized; diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index f1820d2a7fc6..2e9a0aaac501 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -487,6 +487,9 @@ void RebaseNonZeroMinLoop(const Schedule& sch) { (*it).second->bind_thread.defined()) { continue; } + if (iv->dom.defined() && is_const_int(iv->dom->min, 0)) { + continue; + } if (idx < leaf_vars->data.size()) { // insert rebase IterVar rebased = IterVarNode::make( diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 7de03ff1b906..35c58ae41c4c 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -595,6 +595,7 @@ def hoo(a, b): def test_schedule(): # Test perfect loop split + # Test loop annotation @script def outer_product(a, b): c = output_tensor((128, 64), a.dtype) @@ -609,7 +610,9 @@ def outer_product(a, b): j, i = c.op.axis jo, ji = sch[c].split(j, 4) joo, joi = sch[c].split(jo, 4) + sch[c].vectorize(ji) ir = tvm.lower(sch, [a, b, c], simple_mode=True) + print(ir) assert isinstance(ir, tvm.stmt.ProducerConsumer) ir = ir.body assert isinstance(ir, tvm.stmt.AttrStmt) @@ -622,11 +625,8 @@ def outer_product(a, b): assert isinstance(ir, tvm.stmt.For) assert ir.loop_var.name == 'j.outer.inner' ir = ir.body - assert isinstance(ir, tvm.stmt.For) - assert ir.loop_var.name == 'j.inner' # Test imperfect loop split - # Test loop annotation # Test loop binds # Test loop reorder From bc121a390c65f154d8f08f6c79868184cf943b18 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 14:38:16 -0800 Subject: [PATCH 05/21] fix lint --- src/op/hybrid_op.cc | 16 ++++++++-------- src/op/hybrid_op.h | 10 +++++----- src/op/op_util.cc | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 3e0e5e96df3d..84645f2cfbae 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -10,7 +10,7 @@ #include #include #include - +#include #include "op_util.h" #include "hybrid_op.h" @@ -205,15 +205,15 @@ Stmt ApplySplits(const Stage &stage, class LoopSpliter : public IRMutator { Expr factor; IterVar parent, inner, outer; + public: LoopSpliter(const SplitNode *split, - const std::unordered_map& dom_map) : + const std::unordered_map& dom_map) : factor(split->factor) { - auto &parent_ = split->parent; if (parent_->dom.defined()) { CHECK(is_const_int(parent_->dom->min, 0)); - parent= parent_; + parent = parent_; } else { CHECK(dom_map.count(parent_)); auto &dom = 
dom_map.find(parent_)->second; @@ -281,7 +281,7 @@ Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { ForType for_type; public: LoopAnnotator(const Variable *var_, ForType for_type_) : var(var_), for_type(for_type_) {} - + Stmt Mutate_(const For *op, const Stmt &stmt) { if (op->loop_var.get() == var) { CHECK(for_type != op->for_type); @@ -323,8 +323,8 @@ Stmt ApplyLoopOrder(const Stage &stage, Stmt stmt) { return stmt; } -Stmt ApplySchedule(const Stage &stage, const - std::unordered_map& dom_map, Stmt stmt) { +Stmt ApplySchedule(const Stage &stage, + const std::unordered_map& dom_map, Stmt stmt) { stmt = ApplySplits(stage, dom_map, stmt); stmt = ApplyLoopAnnotations(stage, stmt); stmt = ApplyLoopOrder(stage, stmt); @@ -374,5 +374,5 @@ Stmt ReplaceProvideTensor(Stmt stmt, Stmt ret = repl.Mutate(stmt); return repl.found ? ret : stmt; } -} // namespace op +} // namespace op } // namespace tvm diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index 29cc774341cd..efc61d9e09ec 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -3,8 +3,8 @@ * \brief Helper utilities to implement hybrid_op. * \file hybrid_op.h */ -#ifndef TVM_HYBRID_OP_H -#define TVM_HYBRID_OP_H +#ifndef TVM_OP_HYBRID_OP_H_ +#define TVM_OP_HYBRID_OP_H_ #include #include @@ -52,7 +52,7 @@ Stmt ApplySchedule(const Stage& stage, Stmt ApplySplits(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt); -} // namespace op -} // namespace tvm +} // namespace op +} // namespace tvm -#endif // TVM_HYBRID_OP_H +#endif // TVM_OP_HYBRID_OP_H_ diff --git a/src/op/op_util.cc b/src/op/op_util.cc index e6fadd3328a2..9bcbf41df276 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -216,7 +216,7 @@ Stmt Substitute(Stmt s, } IterVarType ForTypeToIterVarType(ir::ForType for_type) { - switch(for_type) { + switch (for_type) { case ForType::Serial: return kDataPar; case ForType::Parallel: @@ -229,7 +229,7 @@ IterVarType ForTypeToIterVarType(ir::ForType for_type) { } ir::ForType IterVarTypeToForType(IterVarType iter_type) { - switch(iter_type) { + switch (iter_type) { case kDataPar: return ForType::Serial; case kParallelized: From ecc940b064bcec0f8d8ed5ceed849a4db72d8482 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 14:43:18 -0800 Subject: [PATCH 06/21] fix lint 1 --- src/op/op_util.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/op/op_util.cc b/src/op/op_util.cc index 9bcbf41df276..b18552d5c562 100644 --- a/src/op/op_util.cc +++ b/src/op/op_util.cc @@ -225,6 +225,8 @@ IterVarType ForTypeToIterVarType(ir::ForType for_type) { return kVectorized; case ForType::Unrolled: return kUnrolled; + default: + return kDataPar; } } From 64b23713b7c91f1fddc7388de165a70f4f08381b Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Thu, 10 Jan 2019 14:50:38 -0800 Subject: [PATCH 07/21] shit! 
--- src/schedule/schedule_dataflow_rewrite.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc index 2e9a0aaac501..f1820d2a7fc6 100644 --- a/src/schedule/schedule_dataflow_rewrite.cc +++ b/src/schedule/schedule_dataflow_rewrite.cc @@ -487,9 +487,6 @@ void RebaseNonZeroMinLoop(const Schedule& sch) { (*it).second->bind_thread.defined()) { continue; } - if (iv->dom.defined() && is_const_int(iv->dom->min, 0)) { - continue; - } if (idx < leaf_vars->data.size()) { // insert rebase IterVar rebased = IterVarNode::make( From 4cbbf424aa008072ca91947f16843e7b4cc34059 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 00:28:11 -0800 Subject: [PATCH 08/21] loop reorder supported --- src/op/hybrid_op.cc | 79 +++++++++++++++++++-- src/op/hybrid_op.h | 21 ++++++ tests/python/unittest/test_hybrid_script.py | 55 +++++++++----- 3 files changed, 134 insertions(+), 21 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 84645f2cfbae..3dcb29274ae4 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "op_util.h" @@ -200,6 +201,7 @@ Stmt HybridOpNode::BuildProvide( namespace op { + Stmt ApplySplits(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt) { class LoopSpliter : public IRMutator { @@ -275,7 +277,8 @@ Stmt ApplySplits(const Stage &stage, return stmt; } -Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { +Stmt ApplyLoopAnnotations(const Stage &stage, + const std::unordered_map& rebased, Stmt stmt) { class LoopAnnotator : public IRMutator { const Variable *var; ForType for_type; @@ -296,7 +299,8 @@ Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { bool equal = false; int found = 0; - const Variable *var = iter_var->var.get(); + const IterVar &actual = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var; + const Variable *var = actual->var.get(); ForType expected = IterVarTypeToForType(iter_var->iter_type); if (stage->iter_var_attrs.count(iter_var)) { expected = IterVarTypeToForType(stage->iter_var_attrs[iter_var]->iter_type); @@ -319,15 +323,79 @@ Stmt ApplyLoopAnnotations(const Stage &stage, Stmt stmt) { return stmt; } -Stmt ApplyLoopOrder(const Stage &stage, Stmt stmt) { +Stmt ApplyLoopOrder(const Stage &stage, + const std::unordered_map &dom_map, + const std::unordered_map &rebased, Stmt stmt) { + std::vector current_order; + PostOrderVisit(stmt, [¤t_order](const NodeRef &node) { + if (const For *op = node.as()) + current_order.push_back(op->loop_var.get()); + }); + std::reverse(current_order.begin(), current_order.end()); + auto &required_ord = stage->leaf_iter_vars; + CHECK_EQ(current_order.size(), required_ord.size()) << "Cannot reorder the loops!"; + std::unordered_map reorder; + bool need_reorder = false; + for (size_t i = 0; i < current_order.size(); ++i) { + auto ¤t = current_order[i]; + const IterVar &iter_var = required_ord[i]; + const IterVar &required = rebased.count(iter_var) ? 
rebased.find(iter_var)->second : iter_var; + CHECK(required->dom.defined() || dom_map.count(required)) << required << "\n"; + reorder[current] = required; + if (current != required->var.get()) { + need_reorder = true; + } + } + + class LoopReorder : public IRMutator { + const Stage &stage; + const std::unordered_map &dom_map; + const std::unordered_map &reorder; + + public: + LoopReorder(const Stage &stage, + const std::unordered_map &dom_map, + const std::unordered_map &reorder) + : stage(stage), dom_map(dom_map), reorder(reorder) {} + + Stmt Mutate_(const For *op, const Stmt &stmt) { + // Reorder from in to out + Stmt body_ = IRMutator::Mutate(op->body); + CHECK(reorder.count(op->loop_var.get())); + auto target = reorder.find(op->loop_var.get())->second; + if (body_.same_as(op->body) && op->loop_var.get() == target->var.get()) + return stmt; + const Stmt &body = op->body.same_as(body_) ? op->body : body_; + ForType for_type = IterVarTypeToForType(target->iter_type); + if (stage->iter_var_attrs.count(target)) { + for_type = IterVarTypeToForType(stage->iter_var_attrs[target]->iter_type); + } + const Range &range = target->dom.defined() ? target->dom : dom_map.find(target)->second; + return For::make(target->var, range->min, range->extent, + for_type, HalideIR::DeviceAPI::None, body); + } + }; + + if (need_reorder) + return LoopReorder(stage, dom_map, reorder).Mutate(stmt); + return stmt; } Stmt ApplySchedule(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt) { + // Gather rebased variables + std::unordered_map rebased; + for (auto rel : stage->relations) { + if (auto rebase = rel.as()) { + rebased[rebase->rebased] = rebase->parent; + CHECK(rebase->parent->dom.defined()); + CHECK(dom_map.count(rebase->rebased)); + } + } stmt = ApplySplits(stage, dom_map, stmt); - stmt = ApplyLoopAnnotations(stage, stmt); - stmt = ApplyLoopOrder(stage, stmt); + stmt = ApplyLoopOrder(stage, dom_map, rebased, stmt); + stmt = ApplyLoopAnnotations(stage, rebased, stmt); return stmt; } @@ -340,6 +408,7 @@ std::vector GatherLoopVars(Stmt stmt) { res_.push_back(IterVarNode::make(dom, loop_var, ForTypeToIterVarType(op->for_type))); } }); + std::reverse(res_.begin(), res_.end()); return res_; } diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index efc61d9e09ec..766212ac9bf1 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -52,6 +52,27 @@ Stmt ApplySchedule(const Stage& stage, Stmt ApplySplits(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt); + +/*! + * \brief Apply loop annotation in the schedule on the function body. + * \param stage The schedule information to be applied. + * \param rebased The map specifies the rebase, a.k.a rename, relationship of these variables. + * \param stmt The statement to be processed. + */ +Stmt ApplyLoopAnnotations(const Stage &stage, + const std::unordered_map& rebased, Stmt stmt); + +/*! + * \brief Apply loop order in the schedule on the function body. + * \param stage The schedule information to be applied. + * \param dom_map The extents of the iterative variables may be used. + * \param rebased The map specifies the rebase, a.k.a rename, relationship of these variables. + * \param stmt The statement to be processed. 
+ */ +Stmt ApplyLoopOrder(const Stage &stage, + const std::unordered_map &dom_map, + const std::unordered_map &rebased, Stmt stmt); + } // namespace op } // namespace tvm diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 35c58ae41c4c..04ceece24620 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -3,7 +3,7 @@ from tvm.hybrid.intrin import HYBRID_GLOBALS @nose.tools.nottest -def run_and_check(func, args, var_dict={}, target='llvm'): +def run_and_check(func, args, var_dict={}, target='llvm', sch=None): def tvm_val_2_py_val(val): val = tvm.ir_pass.Substitute(val, var_dict) val = tvm.ir_pass.Simplify(val) @@ -13,8 +13,13 @@ def tvm_val_2_py_val(val): ctx = tvm.context(target, 0) op = None - outs = func(*tuple(tvm.convert(i) if isinstance(i, list) else i for i in args)) - op = outs[0].op if isinstance(outs, list) else outs.op + if sch is None: + outs = func(*tuple(tvm.convert(i) if isinstance(i, list) else i for i in args)) + op = outs[0].op if isinstance(outs, list) else outs.op + sch = tvm.create_schedule(op) + else: + op = sch.outputs[0] + outs = list(op.outputs) emu_args = [] nd_args = [] @@ -30,13 +35,13 @@ def tvm_val_2_py_val(val): assert isinstance(i, list) emu_args.append(numpy.array(i)) - sch = tvm.create_schedule(op) + compile_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))] + \ + (outs if isinstance(outs, list) else [outs]) module = tvm.build(sch, - [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.expr.Var))] + \ - (outs if isinstance(outs, list) else [outs]), + compile_args, target=target) assert module - + out_tensors = [] for i in range(op.num_outputs): output = op.output(i) @@ -47,7 +52,7 @@ def tvm_val_2_py_val(val): ref_data = func(*emu_args) if isinstance(ref_data, numpy.ndarray): ref_data = [ref_data] - + module(*nd_args) for nd, np in zip(out_tensors, ref_data): @@ -595,29 +600,36 @@ def hoo(a, b): def test_schedule(): # Test perfect loop split + # Test loop reorder # Test loop annotation @script def outer_product(a, b): - c = output_tensor((128, 64), a.dtype) - for i in range(128): + c = output_tensor((64, 64), a.dtype) + for i in range(64): for j in range(64): c[i, j] = a[i] * b[j] return c - a = tvm.placeholder((128,), name='a') - b = tvm.placeholder((64,), name='b') + a = tvm.placeholder((64,), name='a', dtype='float32') + b = tvm.placeholder((64,), name='b', dtype='float32') c = outer_product(a, b) sch = tvm.create_schedule(c.op) - j, i = c.op.axis + i, j = c.op.axis + io, ii = sch[c].split(i, 4) + sch[c].parallel(ii) jo, ji = sch[c].split(j, 4) joo, joi = sch[c].split(jo, 4) sch[c].vectorize(ji) + sch[c].reorder(ii, io, joo, joi, ji) ir = tvm.lower(sch, [a, b, c], simple_mode=True) - print(ir) assert isinstance(ir, tvm.stmt.ProducerConsumer) ir = ir.body assert isinstance(ir, tvm.stmt.AttrStmt) ir = ir.body - assert ir.loop_var.name == 'i' + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'i.inner' + ir = ir.body + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'i.outer' ir = ir.body assert isinstance(ir, tvm.stmt.For) assert ir.loop_var.name == 'j.outer.outer' @@ -626,9 +638,20 @@ def outer_product(a, b): assert ir.loop_var.name == 'j.outer.inner' ir = ir.body + module = tvm.build(sch, [a, b, c]) + assert module + a = numpy.random.randn(64) + b = numpy.random.randn(64) + c = numpy.outer(a, b) + nd_a = tvm.ndarray.array(a.astype('float32')) + nd_b = 
tvm.ndarray.array(b.astype('float32')) + nd_c = tvm.ndarray.array(numpy.zeros((64, 64)).astype('float32')) + module(nd_a, nd_b, nd_c) + tvm.testing.assert_allclose(nd_c.asnumpy(), c, 1e-5, 1e-5) + #run_and_check(outer_product, [a, b], sch=sch) + # Test imperfect loop split # Test loop binds - # Test loop reorder if __name__ == "__main__": From b945d346ea4aa5621b743d32a9f7312342c3284c Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 10:52:49 -0800 Subject: [PATCH 09/21] support bind to add some tests --- src/op/hybrid_op.cc | 79 +++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 3dcb29274ae4..58718c12c40c 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -68,8 +68,8 @@ Array HybridOpNode::InputTensors() const { } Operation HybridOpNode::ReplaceInputs( - const Operation& self, - const std::unordered_map& rmap) const { + const Operation &self, + const std::unordered_map &rmap) const { CHECK_EQ(self.operator->(), this); auto n = make_node(*this); n->body = op::ReplaceTensor(this->body, rmap); @@ -89,13 +89,13 @@ Operation HybridOpNode::ReplaceInputs( } void HybridOpNode::PropBoundToInputs( - const Operation& self, - const std::unordered_map& dom_map, + const Operation &self, + const std::unordered_map &dom_map, std::unordered_map* out_dom_map) const { for (Tensor t : this->inputs) { auto it = out_dom_map->find(t); if (it == out_dom_map->end()) continue; - TensorDom& dom = it->second; + TensorDom &dom = it->second; for (size_t i = 0; i < t->shape.size(); ++i) { dom.data[i].emplace_back(IntSet::range( Range::make_by_min_extent( @@ -105,8 +105,8 @@ void HybridOpNode::PropBoundToInputs( } void HybridOpNode::GatherBound( - const Operation& self, - const std::unordered_map& tensor_dom, + const Operation &self, + const std::unordered_map &tensor_dom, std::unordered_map* out_dom_map) const { for (auto iter_var : axis) { CHECK(!out_dom_map->count(iter_var)); @@ -115,9 +115,9 @@ void HybridOpNode::GatherBound( } Stmt HybridOpNode::BuildRealize( - const Stage& stage, - const std::unordered_map& realize_map, - const Stmt& body) const { + const Stage &stage, + const std::unordered_map &realize_map, + const Stmt &body) const { CHECK_EQ(stage->op.get(), this); Stmt realize_body = body; for (int k = 0; k < num_outputs(); ++k) { @@ -136,8 +136,8 @@ Stmt HybridOpNode::BuildRealize( } Stmt HybridOpNode::BuildProvide( - const Stage& stage, - const std::unordered_map& dom_map, + const Stage &stage, + const std::unordered_map &dom_map, bool debug_keep_trivial_loop) const { CHECK_EQ(stage->op.operator->(), this); Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body); @@ -203,14 +203,14 @@ namespace op { Stmt ApplySplits(const Stage &stage, - const std::unordered_map& dom_map, Stmt stmt) { + const std::unordered_map &dom_map, Stmt stmt) { class LoopSpliter : public IRMutator { Expr factor; IterVar parent, inner, outer; public: LoopSpliter(const SplitNode *split, - const std::unordered_map& dom_map) : + const std::unordered_map &dom_map) : factor(split->factor) { auto &parent_ = split->parent; if (parent_->dom.defined()) { @@ -278,46 +278,61 @@ Stmt ApplySplits(const Stage &stage, } Stmt ApplyLoopAnnotations(const Stage &stage, - const std::unordered_map& rebased, Stmt stmt) { + const std::unordered_map &rebased, Stmt stmt) { class LoopAnnotator : public IRMutator { const Variable *var; - ForType for_type; + const IterVarAttr &attr; + public: - 
LoopAnnotator(const Variable *var_, ForType for_type_) : var(var_), for_type(for_type_) {} + LoopAnnotator(const Variable *var_, const IterVarAttr &attr_) : var(var_), attr(attr_) {} Stmt Mutate_(const For *op, const Stmt &stmt) { if (op->loop_var.get() == var) { - CHECK(for_type != op->for_type); - return For::make(op->loop_var, op->min, op->extent, - for_type, op->device_api, op->body); + if (attr->bind_thread.defined()) { + const auto &iter_var = attr->bind_thread; + if (iter_var->dom.defined()) { + CHECK(is_const_int(iter_var->dom->min, 0)); + CHECK(Equal(iter_var->dom->extent, op->extent)) + << "Thread extent and loop extent mismatch!\n"; + } + std::unordered_map rmap; + rmap[op->loop_var.get()] = iter_var; + Stmt body = ir::Substitute(op->body, rmap); + return AttrStmt::make(iter_var, "thread_extent", op->extent, body); + } else { + return For::make(op->loop_var, op->min, op->extent, + IterVarTypeToForType(attr->iter_type), op->device_api, op->body); + } } return IRMutator::Mutate_(op, stmt); } }; for (auto &iter_var : stage->leaf_iter_vars) { - bool equal = false; + bool need_change = false; int found = 0; const IterVar &actual = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var; const Variable *var = actual->var.get(); ForType expected = IterVarTypeToForType(iter_var->iter_type); + IterVarAttr attr; if (stage->iter_var_attrs.count(iter_var)) { - expected = IterVarTypeToForType(stage->iter_var_attrs[iter_var]->iter_type); + attr = stage->iter_var_attrs[iter_var]; + expected = IterVarTypeToForType(attr->iter_type); } - PostOrderVisit(stmt, [&found, &var, &expected, &equal](const NodeRef &node) { + PostOrderVisit(stmt, [&found, &var, &attr, &expected, &need_change](const NodeRef &node) { if (const For *op = node.as()) { if (op->loop_var.get() == var) { ++found; - equal = expected == op->for_type; + need_change = expected != op->for_type || (attr.defined() && attr->bind_thread.defined()); } } }); CHECK_EQ(found, 1) << " iter var should be found exactly once!"; - if (!equal) { - stmt = LoopAnnotator(var, expected).Mutate(stmt); + if (need_change) { + stmt = LoopAnnotator(var, attr).Mutate(stmt); } } return stmt; @@ -364,7 +379,7 @@ Stmt ApplyLoopOrder(const Stage &stage, CHECK(reorder.count(op->loop_var.get())); auto target = reorder.find(op->loop_var.get())->second; if (body_.same_as(op->body) && op->loop_var.get() == target->var.get()) - return stmt; + return stmt; const Stmt &body = op->body.same_as(body_) ? 
op->body : body_; ForType for_type = IterVarTypeToForType(target->iter_type); if (stage->iter_var_attrs.count(target)) { @@ -383,7 +398,7 @@ Stmt ApplyLoopOrder(const Stage &stage, } Stmt ApplySchedule(const Stage &stage, - const std::unordered_map& dom_map, Stmt stmt) { + const std::unordered_map &dom_map, Stmt stmt) { // Gather rebased variables std::unordered_map rebased; for (auto rel : stage->relations) { @@ -415,10 +430,10 @@ std::vector GatherLoopVars(Stmt stmt) { // replacer to replace tensors' usage in Provide class ProviderReplacer : public ir::IRMutator { public: - explicit ProviderReplacer(const std::unordered_map& vmap) + explicit ProviderReplacer(const std::unordered_map &vmap) : vmap_(vmap) {} - Stmt Mutate_(const ir::Provide* op, const Stmt& s) { + Stmt Mutate_(const ir::Provide* op, const Stmt &s) { Tensor t = Operation(op->func.node_).output(op->value_index); auto it = vmap_.find(t); if (it != vmap_.end()) { @@ -434,11 +449,11 @@ class ProviderReplacer : public ir::IRMutator { bool found{false}; private: - const std::unordered_map& vmap_; + const std::unordered_map &vmap_; }; Stmt ReplaceProvideTensor(Stmt stmt, - const std::unordered_map& replace) { + const std::unordered_map &replace) { ProviderReplacer repl(replace); Stmt ret = repl.Mutate(stmt); return repl.found ? ret : stmt; From 2639873c94d220210257b9fefb3efcfe59fdd2e0 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 12:06:32 -0800 Subject: [PATCH 10/21] fused tested --- src/op/hybrid_op.cc | 71 ++++++++++++++++----- tests/python/unittest/test_hybrid_script.py | 37 ++++++----- 2 files changed, 74 insertions(+), 34 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 58718c12c40c..05d2a6ee0a26 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -209,9 +209,10 @@ Stmt ApplySplits(const Stage &stage, IterVar parent, inner, outer; public: + bool splitted; LoopSpliter(const SplitNode *split, const std::unordered_map &dom_map) : - factor(split->factor) { + factor(split->factor), splitted(false) { auto &parent_ = split->parent; if (parent_->dom.defined()) { CHECK(is_const_int(parent_->dom->min, 0)); @@ -248,29 +249,65 @@ Stmt ApplySplits(const Stage &stage, IterVarTypeToForType(inner->iter_type), op->device_api, ret); ret = For::make(outer->var, Expr(0), outer->dom->extent, IterVarTypeToForType(outer->iter_type), op->device_api, ret); + splitted = true; return ret; } return IRMutator::Mutate_(op, stmt); } }; - bool changed = true; - while (changed) { - changed = false; - for (auto &rel : stage->relations) { - if (const SplitNode* split = rel.as()) { - bool not_splited = false; - PostOrderVisit(stmt, [¬_splited, &split](const NodeRef &node) { - if (const Variable *var = node.as()) { - if (var == split->parent->var.get()) - not_splited = true; - } - }); - if (not_splited) { - stmt = LoopSpliter(split, dom_map).Mutate(stmt); - changed = true; - } + class LoopFuser : public IRMutator { + const IterVar &parent; + const Variable *inner; + const Variable *outer; + bool under_outer; + Expr extent; + + public: + bool fused; + LoopFuser(const FuseNode *fuse_) + : parent(fuse_->fused), inner(fuse_->inner->var.get()), + outer(fuse_->outer->var.get()), under_outer(false), + extent(0), fused(false) {} + + Stmt Mutate_(const For *op, const Stmt &stmt) { + if (op->loop_var.get() == inner) { + CHECK(under_outer); + std::unordered_map rmap; + rmap[op->loop_var.get()] = parent % op->extent; + extent = op->extent; + fused = true; + return ir::Substitute(op->body, rmap); + } else if 
(op->loop_var.get() == outer) { + under_outer = true; + Stmt body = IRMutator::Mutate(op->body); + std::unordered_map rmap; + rmap[op->loop_var.get()] = parent / extent; + body = ir::Substitute(body, rmap); + under_outer = false; + return For::make(parent->var, Expr(0), extent * op->extent, + op->for_type, op->device_api, body); + } else if (under_outer) { + Stmt body = IRMutator::Mutate(op->body); + std::unordered_map rmap; + rmap[op->loop_var.get()] = parent / extent % op->extent; + body = ir::Substitute(body, rmap); + extent = extent * op->extent; + return body; } + return IRMutator::Mutate(stmt); + } + }; + + for (auto &rel : stage->relations) { + if (const SplitNode *split = rel.as()) { + LoopSpliter Spliter(split, dom_map); + stmt = Spliter.Mutate(stmt); + CHECK(Spliter.splitted); + } else if (const FuseNode *fuse = rel.as()) { + LoopFuser Fuser(fuse); + stmt = Fuser.Mutate(stmt); + CHECK(Fuser.fused); } } diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 04ceece24620..c1afbf1f5c38 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -3,7 +3,7 @@ from tvm.hybrid.intrin import HYBRID_GLOBALS @nose.tools.nottest -def run_and_check(func, args, var_dict={}, target='llvm', sch=None): +def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None): def tvm_val_2_py_val(val): val = tvm.ir_pass.Substitute(val, var_dict) val = tvm.ir_pass.Simplify(val) @@ -18,8 +18,9 @@ def tvm_val_2_py_val(val): op = outs[0].op if isinstance(outs, list) else outs.op sch = tvm.create_schedule(op) else: - op = sch.outputs[0] - outs = list(op.outputs) + assert outs is not None + assert isinstance(outs, list) + op = outs[0].op emu_args = [] nd_args = [] @@ -599,9 +600,6 @@ def hoo(a, b): run_and_check(hoo, [a, b]) def test_schedule(): - # Test perfect loop split - # Test loop reorder - # Test loop annotation @script def outer_product(a, b): c = output_tensor((64, 64), a.dtype) @@ -612,6 +610,10 @@ def outer_product(a, b): a = tvm.placeholder((64,), name='a', dtype='float32') b = tvm.placeholder((64,), name='b', dtype='float32') c = outer_product(a, b) + + # Test perfect loop split + # Test loop reorder + # Test loop annotation sch = tvm.create_schedule(c.op) i, j = c.op.axis io, ii = sch[c].split(i, 4) @@ -637,18 +639,19 @@ def outer_product(a, b): assert isinstance(ir, tvm.stmt.For) assert ir.loop_var.name == 'j.outer.inner' ir = ir.body + run_and_check(outer_product, [a, b], sch=sch, outs=[c]) - module = tvm.build(sch, [a, b, c]) - assert module - a = numpy.random.randn(64) - b = numpy.random.randn(64) - c = numpy.outer(a, b) - nd_a = tvm.ndarray.array(a.astype('float32')) - nd_b = tvm.ndarray.array(b.astype('float32')) - nd_c = tvm.ndarray.array(numpy.zeros((64, 64)).astype('float32')) - module(nd_a, nd_b, nd_c) - tvm.testing.assert_allclose(nd_c.asnumpy(), c, 1e-5, 1e-5) - #run_and_check(outer_product, [a, b], sch=sch) + # Test fuse + sch = tvm.create_schedule(c.op) + sch[c].fuse(c.op.axis[0], c.op.axis[1]) + ir = tvm.lower(sch, [a, b, c], simple_mode=True) + assert isinstance(ir, tvm.stmt.ProducerConsumer) + ir = ir.body + assert isinstance(ir, tvm.stmt.AttrStmt) + ir = ir.body + assert isinstance(ir, tvm.stmt.For) + assert ir.loop_var.name == 'i.j.fused' + run_and_check(outer_product, [a, b], sch=sch, outs=[c]) # Test imperfect loop split # Test loop binds From eac5f200903d1bb40ca16ccf34464318dbc318bc Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 12:13:03 
-0800 Subject: [PATCH 11/21] imperfect loop testcase --- src/op/hybrid_op.cc | 6 ++++-- tests/python/unittest/test_hybrid_script.py | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 05d2a6ee0a26..e80546e586ce 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -262,14 +262,16 @@ Stmt ApplySplits(const Stage &stage, const Variable *outer; bool under_outer; Expr extent; - + public: bool fused; - LoopFuser(const FuseNode *fuse_) + explicit LoopFuser(const FuseNode *fuse_) : parent(fuse_->fused), inner(fuse_->inner->var.get()), outer(fuse_->outer->var.get()), under_outer(false), extent(0), fused(false) {} + // TODO: Handle imperfect loops + Stmt Mutate_(const For *op, const Stmt &stmt) { if (op->loop_var.get() == inner) { CHECK(under_outer); diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index c1afbf1f5c38..24f2a1600914 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -654,6 +654,11 @@ def outer_product(a, b): run_and_check(outer_product, [a, b], sch=sch, outs=[c]) # Test imperfect loop split + sch = tvm.create_schedule(c.op) + sch[c].split(c.op.axis[0], 3) + ir = tvm.lower(sch, [a, b, c], simple_mode=True) + run_and_check(outer_product, [a, b], sch=sch, outs=[c]) + # Test loop binds From 471af84e70d580a9f01b302ebea88d40a38d29fd Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 12:45:30 -0800 Subject: [PATCH 12/21] fix lint --- src/op/hybrid_op.cc | 7 ++++--- src/op/hybrid_op.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index e80546e586ce..c322c5d34ef1 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -202,7 +202,7 @@ Stmt HybridOpNode::BuildProvide( namespace op { -Stmt ApplySplits(const Stage &stage, +Stmt ApplyLoopShapes(const Stage &stage, const std::unordered_map &dom_map, Stmt stmt) { class LoopSpliter : public IRMutator { Expr factor; @@ -270,7 +270,7 @@ Stmt ApplySplits(const Stage &stage, outer(fuse_->outer->var.get()), under_outer(false), extent(0), fused(false) {} - // TODO: Handle imperfect loops + // TODO(@were): Handle imperfect loops Stmt Mutate_(const For *op, const Stmt &stmt) { if (op->loop_var.get() == inner) { @@ -447,13 +447,14 @@ Stmt ApplySchedule(const Stage &stage, CHECK(dom_map.count(rebase->rebased)); } } - stmt = ApplySplits(stage, dom_map, stmt); + stmt = ApplyLoopShapes(stage, dom_map, stmt); stmt = ApplyLoopOrder(stage, dom_map, rebased, stmt); stmt = ApplyLoopAnnotations(stage, rebased, stmt); return stmt; } std::vector GatherLoopVars(Stmt stmt) { + // TODO(@were): Write a comprehensive pass to analyze iter var types std::vector res_; PostOrderVisit(stmt, [&res_](const NodeRef &node) { if (const For *op = node.as()) { diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index 766212ac9bf1..ce75e6c37627 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -49,7 +49,7 @@ Stmt ApplySchedule(const Stage& stage, * \param dom_map The extents of the iterative variables may be used. * \param stmt The statement to be processed. 
*/ -Stmt ApplySplits(const Stage &stage, +Stmt ApplyLoopShapes(const Stage &stage, const std::unordered_map& dom_map, Stmt stmt); From 9553514c411841dde36a4001694c6f466ed0cc78 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 13:09:08 -0800 Subject: [PATCH 13/21] add bind testcase --- tests/python/unittest/test_hybrid_script.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 24f2a1600914..f89123e346ec 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -288,9 +288,22 @@ def vec_add(a, b): a = tvm.placeholder((1000, ), dtype='float32', name='a') b = tvm.placeholder((1000, ), dtype='float32', name='b') - run_and_check(vec_add, [a, b], target='cuda') + @script + def raw(a, b): + c = output_tensor((1000, ), 'float32') + for i in range(1000): + c[i] = a[i] + b[i] + return c + + c = raw(a, b) + sch = tvm.create_schedule(c.op) + x = tvm.thread_axis('threadIdx.x') + sch[c].bind(c.op.axis[0], x) + run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') + + def test_math_intrin(): @script def intrin_real(a): From eb7cee6079d1b85f2ec59d0fa756b4f2294170dc Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 11 Jan 2019 22:54:57 -0800 Subject: [PATCH 14/21] fix doc --- src/op/hybrid_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index ce75e6c37627..c66b9e7e083a 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -44,7 +44,7 @@ Stmt ApplySchedule(const Stage& stage, const std::unordered_map& dom_map, Stmt stmt); /*! - * \brief Apply loop splits in the schedule on the function body. + * \brief Apply loop splits and fuses in the schedule on the function body. * \param stage The schedule information to be applied. * \param dom_map The extents of the iterative variables may be used. * \param stmt The statement to be processed. 
From f3860ba872a4fa1b55345c0609057eb7346f527d Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Sat, 12 Jan 2019 20:04:41 -0800 Subject: [PATCH 15/21] fix online edit typo --- tests/python/unittest/test_hybrid_script.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index f89123e346ec..7e08a7dc9ccc 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -303,6 +303,22 @@ def raw(a, b): sch[c].bind(c.op.axis[0], x) run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') + # Test loop binds + @tvm.hybrid.script + def goo(a, b): + c = output_tensor(a.shape, a.dtype) + len_b = len(b) + for i in const_range(len_b * 2): + if i < len_b: + c[i] = a[i] + b[i] + else: + c[i - len_b] = a[i - len_b] + b[i - len_b] + return c + a = tvm.placeholder((5, ), name='a', dtype='int32') + b = [1, 2, 3, 4, 5] + c = goo(a, tvm.convert(b)) + sch = tvm.create_schedule(c.op) + run_and_check(goo, [a, b], sch=sch, outs=[c]) def test_math_intrin(): @script @@ -583,6 +599,7 @@ def foo(a, b): run_and_check(foo, [a, b]) @tvm.hybrid.script +<<<<<<< 52c435f9c55a53e2e8e10b3a9fe672d2f8718122 def goo(a, b): c = output_tensor(a.shape, a.dtype) len_b = len(b) @@ -599,6 +616,8 @@ def goo(a, b): run_and_check(goo, [a, b]) @tvm.hybrid.script +======= +>>>>>>> fix online edit typo def hoo(a, b): c = output_tensor(a.shape, a.dtype) len_b = len(b) From d7e0b5ae8e4f97a5711fdec93eec7afcc2c52335 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Sun, 13 Jan 2019 10:51:03 -0800 Subject: [PATCH 16/21] resolve @mercymercy review --- src/op/hybrid_op.cc | 18 +++++------------- src/op/hybrid_op.h | 1 + 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index c322c5d34ef1..55e4f61b9fad 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -206,23 +206,15 @@ Stmt ApplyLoopShapes(const Stage &stage, const std::unordered_map &dom_map, Stmt stmt) { class LoopSpliter : public IRMutator { Expr factor; - IterVar parent, inner, outer; + const Variable *parent; + IterVar inner, outer; public: bool splitted; LoopSpliter(const SplitNode *split, const std::unordered_map &dom_map) : factor(split->factor), splitted(false) { - auto &parent_ = split->parent; - if (parent_->dom.defined()) { - CHECK(is_const_int(parent_->dom->min, 0)); - parent = parent_; - } else { - CHECK(dom_map.count(parent_)); - auto &dom = dom_map.find(parent_)->second; - CHECK(is_const_int(dom->min, 0)); - parent = IterVarNode::make(dom, parent_->var, parent_->iter_type); - } + parent = split->parent->var.get(); auto &inner_ = split->inner; CHECK(dom_map.count(inner_)); @@ -239,11 +231,11 @@ Stmt ApplyLoopShapes(const Stage &stage, } Stmt Mutate_(const For *op, const Stmt &stmt) { - if (op->loop_var.get() == parent->var.get()) { + if (op->loop_var.get() == parent) { std::unordered_map rmap; rmap[op->loop_var.get()] = inner + outer * factor; Stmt ret = ir::Substitute(op->body, rmap); - Expr cond = likely(outer * factor < (parent->dom->extent - inner)); + Expr cond = likely(outer * factor < (op->extent - inner)); ret = IfThenElse::make(cond, ret); ret = For::make(inner->var, Expr(0), inner->dom->extent, IterVarTypeToForType(inner->iter_type), op->device_api, ret); diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index c66b9e7e083a..db988b45875f 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -38,6 +38,7 @@ Stmt ReplaceProvideTensor(Stmt stmt, /*! 
* \brief Apply the schedule manipulation on the function body. * \param stmt The statement to be processed. + * \param dom_map The extents of the iterative variables may be used. * \param stage The schedule information to be applied. */ Stmt ApplySchedule(const Stage& stage, From 4a187143260398344823bb3de8746292b55e5cd1 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Sun, 13 Jan 2019 10:59:16 -0800 Subject: [PATCH 17/21] fix indent --- src/op/hybrid_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/op/hybrid_op.h b/src/op/hybrid_op.h index db988b45875f..892e420137d6 100644 --- a/src/op/hybrid_op.h +++ b/src/op/hybrid_op.h @@ -51,7 +51,7 @@ Stmt ApplySchedule(const Stage& stage, * \param stmt The statement to be processed. */ Stmt ApplyLoopShapes(const Stage &stage, - const std::unordered_map& dom_map, Stmt stmt); + const std::unordered_map& dom_map, Stmt stmt); /*! From f1a079df6a289e44c25d1dbe244ddebd1ca59e10 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Mon, 14 Jan 2019 09:22:34 -0800 Subject: [PATCH 18/21] i should convince myself it is not flaky test first --- docs/langref/hybrid_script.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst index 122bcd95e690..7043281fcafb 100644 --- a/docs/langref/hybrid_script.rst +++ b/docs/langref/hybrid_script.rst @@ -68,17 +68,23 @@ to LLVM module. Tuning ~~~~~~ -**Under construction, not supported yet.** - Follow up the example above, you can use some tvm like interfaces to tune the code: .. code-block:: python + i, j = c.op.axis sch = tvm.create_schedule(op) jo, ji = sch.split(j, 4) sch.vectorize(ji) -``split``, ``reorder``, and loop_annotation will be supported! +For now, you can use loop annotations (``unroll``, ``parallel``, ``vectorize``, and ``bind``), +loop manipulation (``split`` and ``fuse``), and ``reorder``. + +.. note:: + + This is a preliminary function, so users should be in charge of the correctness + of the functionality after tuning. Specifically, users should be careful when + fusing and reorderding imperfect loops. 
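For instance, with a two-dimensional hybrid op ``c`` as in the snippet above (a sketch only; ``a`` and ``b`` stand for its input placeholders and are assumed to be in scope), a split would be written as:

.. code-block:: python

    i, j = c.op.axis
    sch = tvm.create_schedule(c.op)
    # Split the inner loop; if its extent is not divisible by the factor,
    # the lowered body is guarded by a likely(...) condition instead of failing.
    jo, ji = sch[c].split(j, 4)
    # Fusing works the other way around, collapsing two adjacent loops into one:
    # fused = sch[c].fuse(i, jo)
    print(tvm.lower(sch, [a, b, c], simple_mode=True))
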
Loops ~~~~~ From c5fa89f91ea040a6042bf653edf7bbf2620e2af3 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Mon, 14 Jan 2019 10:04:51 -0800 Subject: [PATCH 19/21] fix test hybrid --- tests/python/unittest/test_hybrid_script.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 7e08a7dc9ccc..a54fec3a7bf7 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -599,7 +599,6 @@ def foo(a, b): run_and_check(foo, [a, b]) @tvm.hybrid.script -<<<<<<< 52c435f9c55a53e2e8e10b3a9fe672d2f8718122 def goo(a, b): c = output_tensor(a.shape, a.dtype) len_b = len(b) @@ -616,8 +615,6 @@ def goo(a, b): run_and_check(goo, [a, b]) @tvm.hybrid.script -======= ->>>>>>> fix online edit typo def hoo(a, b): c = output_tensor(a.shape, a.dtype) len_b = len(b) From ea12b62c01f8e7c5308966cc0a084c8b620e6caf Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Mon, 14 Jan 2019 10:19:06 -0800 Subject: [PATCH 20/21] how many flaky test are you expecting; i ball ball u to let me pass --- src/op/hybrid_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 55e4f61b9fad..acd7b5737c5f 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -118,6 +118,7 @@ Stmt HybridOpNode::BuildRealize( const Stage &stage, const std::unordered_map &realize_map, const Stmt &body) const { + // TODO(@were): Add attribute inject here and remove it from hybrid parser. CHECK_EQ(stage->op.get(), this); Stmt realize_body = body; for (int k = 0; k < num_outputs(); ++k) { @@ -430,6 +431,7 @@ Stmt ApplyLoopOrder(const Stage &stage, Stmt ApplySchedule(const Stage &stage, const std::unordered_map &dom_map, Stmt stmt) { + // TODO(@were): Eliminate loop rebase in script parser and move the burden here // Gather rebased variables std::unordered_map rebased; for (auto rel : stage->relations) { From 30f2a247c0485a06ac106cab50ec8f9cff087ddb Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Mon, 14 Jan 2019 12:39:00 -0800 Subject: [PATCH 21/21] rebase halide... --- 3rdparty/HalideIR | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR index a08e26e5a97f..6e7c1f046fda 160000 --- a/3rdparty/HalideIR +++ b/3rdparty/HalideIR @@ -1 +1 @@ -Subproject commit a08e26e5a97f4ef4d566a42f6c78704b3f9c7b8a +Subproject commit 6e7c1f046fda536562dc80977e93324fee2324bd
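
With the series applied, a hybrid op can be split, fused, reordered, annotated, and bound like a regular compute op. A minimal end-to-end sketch of the imperfect-split case exercised by the new testcases (the kernel body and shapes here are illustrative rather than copied verbatim from the test file):

    import tvm
    from tvm.hybrid import script

    @script
    def outer_product(a, b):
        c = output_tensor((100, 99), 'float32')
        for i in range(100):
            for j in range(99):
                c[i, j] = a[i] * b[j]
        return c

    a = tvm.placeholder((100, ), dtype='float32', name='a')
    b = tvm.placeholder((99, ), dtype='float32', name='b')
    c = outer_product(a, b)

    sch = tvm.create_schedule(c.op)
    # 100 is not divisible by 3, so ApplyLoopShapes keeps the split correct by
    # guarding the loop body with likely(outer * 3 + inner < 100).
    sch[c].split(c.op.axis[0], 3)
    print(tvm.lower(sch, [a, b, c], simple_mode=True))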