From 317186f98ddecadb67650293a5c4269cbf7e76e6 Mon Sep 17 00:00:00 2001
From: Jon
Date: Fri, 20 Sep 2019 09:41:01 -0700
Subject: [PATCH 01/18] Fix extern schedule for x86

---
 topi/include/topi/x86/extern.h     | 89 ++++++++++++++++++++++++++++++
 topi/python/topi/cuda/__init__.py  |  1 -
 topi/python/topi/cuda/extern.py    | 48 ----------------
 topi/python/topi/generic/extern.py | 12 ++--
 topi/python/topi/x86/dense.py      |  4 ++
 5 files changed, 101 insertions(+), 53 deletions(-)
 create mode 100644 topi/include/topi/x86/extern.h
 delete mode 100644 topi/python/topi/cuda/extern.py

diff --git a/topi/include/topi/x86/extern.h b/topi/include/topi/x86/extern.h
new file mode 100644
index 000000000000..f925c72dd428
--- /dev/null
+++ b/topi/include/topi/x86/extern.h
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file x86/extern.h
+ * \brief x86 schedule for extern followed by injective operations
+ */
+#ifndef TOPI_X86_EXTERN_H_
+#define TOPI_X86_EXTERN_H_
+
+#include "topi/tags.h"
+#include "topi/detail/fuse.h"
+#include "tvm/operation.h"
+#include "tvm/build_module.h"
+
+namespace topi {
+using namespace tvm;
+
+namespace x86 {
+/*!
+ * \brief Schedule a given operation representing one of the outputs of an
+ * external function which is followed by injective operations.
+ *
+ * \param target The target to generate a schedule for.
+ * \param op The operation representing the output followed by injective operations.
+ * \param sch The schedule to apply this scheduling to
+ *
+ * \return The schedule given by sch
+ */
+inline Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) {
+  auto x = op.output(0);
+  auto axis = sch[x]->op.as<ComputeOpNode>()->axis;
+  if (axis.size() == 4) {
+    auto n = axis[0];
+    auto c = axis[1];
+    auto fused = detail::Fuse(sch[x], { n, c });  // for nhwc layout, fuse n and h
+    sch[x].parallel(fused);
+  } else {
+    sch[x].parallel(axis[0]);
+  }
+  return sch;
+}
+
+/*!
+* \brief Schedule an extern op followed by injective operations.
+* For example, cudnn kernel + bias add + relu
+*
+* \param target The target to generate a schedule for.
+* \param outs The output tensors.
+*
+* \return A schedule for the op.
+*/
+inline Schedule schedule_extern(const Target& target, Array<Tensor> outs) {
+  Array<Operation> out_ops;
+  for (auto t : outs) {
+    out_ops.push_back(t->op);
+  }
+  auto s = create_schedule(out_ops);
+
+  tvm::schedule::AutoInlineInjective(s);
+  for (auto out : outs) {
+    if (out->op->derived_from<tvm::ExternOpNode>()) {
+      continue;
+    }
+    ScheduleOutputForExtern(target, out->op, s);
+  }
+
+  return s;
+}
+
+}  // namespace x86
+}  // namespace topi
+#endif  // TOPI_X86_EXTERN_H_
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 403f67b972f7..beab0afb0e3f 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -13,7 +13,6 @@
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
 from .dense import schedule_dense
 from .pooling import schedule_pool, schedule_adaptive_pool
-from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
 from .batch_matmul import schedule_batch_matmul
 from .vision import *
diff --git a/topi/python/topi/cuda/extern.py b/topi/python/topi/cuda/extern.py
deleted file mode 100644
index 74b88ec4ccd7..000000000000
--- a/topi/python/topi/cuda/extern.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, unused-variable,
-"""Schedule for cudnn and miopen extern op"""
-import tvm
-from .. import generic
-from .injective import _schedule_injective
-
-
-@generic.schedule_extern.register(["cuda", "gpu"])
-def schedule_extern(outs):
-    """Schedule for an extern op followed by injective operations.
-    For example, cudnn kernel + bias add + relu.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of extern plus injective ops in the format
-        of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
-    """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    s = tvm.create_schedule([x.op for x in outs])
-
-    tvm.schedule.AutoInlineInjective(s)
-    for out in outs:
-        if isinstance(out.op, tvm.tensor.ExternOp):
-            continue
-        _schedule_injective(out.op, s)
-    return s
diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py
index 3728222fa622..6cddc4c7230d 100644
--- a/topi/python/topi/generic/extern.py
+++ b/topi/python/topi/generic/extern.py
@@ -35,8 +35,12 @@ def schedule_extern(outs):
     sch: Schedule
         The computation schedule for the op.
""" - target = tvm.target.current_target(allow_none=False) - if target.target_name != "llvm": - raise RuntimeError("schedule_extern not registered for '%s'" % target) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - return tvm.create_schedule([x.op for x in outs]) + s = tvm.create_schedule([x.op for x in outs]) + + tvm.schedule.AutoInlineInjective(s) + for out in outs: + if isinstance(out.op, tvm.tensor.ExternOp): + continue + _schedule_injective(out.op, s) + return s diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index 27cadfa04cce..9b7624eedd06 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -111,6 +111,10 @@ def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") def _schedule_dense(cfg, outs): + target = tvm.target.current_target() + if "cblas" in target.libs: + return generic.schedule_extern(outs) + s = tvm.create_schedule([x.op for x in outs]) def _callback(op): From 299a80bc60409af074853bff387f075cf6264289 Mon Sep 17 00:00:00 2001 From: Jon Date: Fri, 20 Sep 2019 14:34:30 -0700 Subject: [PATCH 02/18] Register x86::schedule_extern --- topi/python/topi/generic/extern.py | 1 + topi/src/topi.cc | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index 6cddc4c7230d..8273d9263ef1 100644 --- a/topi/python/topi/generic/extern.py +++ b/topi/python/topi/generic/extern.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from .injective import _schedule_injective @tvm.target.generic_func def schedule_extern(outs): diff --git a/topi/src/topi.cc b/topi/src/topi.cc index b0709399660c..91d18d212428 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -65,6 +65,7 @@ #include #include +#include #include #include @@ -606,6 +607,11 @@ TVM_REGISTER_GLOBAL("topi.x86.default_schedule") } }); +TVM_REGISTER_GLOBAL("topi.x86.schedule_extern") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = topi::x86::schedule_extern(args[0], args[1]); + }); + TVM_REGISTER_GLOBAL("topi.x86.schedule_injective") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = topi::x86::schedule_injective(args[0], args[1]); From d7b21f888729be55b7b0d74dda0fcf3a858602ca Mon Sep 17 00:00:00 2001 From: Jon Date: Fri, 20 Sep 2019 15:20:41 -0700 Subject: [PATCH 03/18] Fix --- topi/python/topi/generic/extern.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index 8273d9263ef1..78d38de26512 100644 --- a/topi/python/topi/generic/extern.py +++ b/topi/python/topi/generic/extern.py @@ -19,7 +19,7 @@ from __future__ import absolute_import as _abs import tvm -from .injective import _schedule_injective +from .injective import schedule_injective @tvm.target.generic_func def schedule_extern(outs): @@ -43,5 +43,5 @@ def schedule_extern(outs): for out in outs: if isinstance(out.op, tvm.tensor.ExternOp): continue - _schedule_injective(out.op, s) + schedule_injective(out.op, s) return s From 261639395e3ed74fbb3b26318bd12bc72d953dee Mon Sep 17 00:00:00 2001 From: Jon Date: Fri, 20 Sep 2019 15:33:45 -0700 Subject: [PATCH 04/18] Fix --- topi/python/topi/generic/extern.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index 78d38de26512..5983aeccfb21 100644 --- a/topi/python/topi/generic/extern.py 
From 261639395e3ed74fbb3b26318bd12bc72d953dee Mon Sep 17 00:00:00 2001
From: Jon
Date: Fri, 20 Sep 2019 15:33:45 -0700
Subject: [PATCH 04/18] Fix

---
 topi/python/topi/generic/extern.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py
index 78d38de26512..5983aeccfb21 100644
--- a/topi/python/topi/generic/extern.py
+++ b/topi/python/topi/generic/extern.py
@@ -43,5 +43,5 @@ def schedule_extern(outs):
     for out in outs:
         if isinstance(out.op, tvm.tensor.ExternOp):
             continue
-        schedule_injective(out.op, s)
+        schedule_injective(out)
     return s

From 4fb7063282b6c5ab69356204a68545144059d4e8 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Mon, 23 Sep 2019 11:22:27 -0700
Subject: [PATCH 05/18] Replace extern.py with extern.h

---
 topi/include/topi/cuda/extern.h    | 86 -----------------------------
 topi/include/topi/generic/extern.h | 11 ++++
 topi/include/topi/x86/extern.h     | 89 ------------------------------
 topi/python/topi/generic/extern.py | 13 +----
 topi/src/topi.cc                   | 12 ----
 5 files changed, 14 insertions(+), 197 deletions(-)
 delete mode 100644 topi/include/topi/cuda/extern.h
 delete mode 100644 topi/include/topi/x86/extern.h

diff --git a/topi/include/topi/cuda/extern.h b/topi/include/topi/cuda/extern.h
deleted file mode 100644
index 7800986ead02..000000000000
--- a/topi/include/topi/cuda/extern.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file cuda/extern.h
- * \brief CUDA schedule for extern followed by injective operations
- */
-#ifndef TOPI_CUDA_EXTERN_H_
-#define TOPI_CUDA_EXTERN_H_
-
-#include "topi/tags.h"
-#include "topi/detail/fuse.h"
-#include "tvm/operation.h"
-#include "tvm/build_module.h"
-
-namespace topi {
-using namespace tvm;
-
-namespace cuda {
-/*!
- * \brief Schedule a given operation representing one of the outputs of an
- * external function which is followed by injective operations.
- *
- * \param target The target to generate a schedule for.
- * \param op The operation representing the output followed by injective operations.
- * \param sch The schedule to apply this scheduling to
- *
- * \return The schedule given by sch
- */
-inline Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) {
-  auto x = op.output(0);
-  auto fused = detail::Fuse(sch[x], sch[x]->op.as<ComputeOpNode>()->axis);
-  auto num_thread = target->max_num_threads;
-  IterVar bx, tx;
-  sch[x].split(fused, num_thread, &bx, &tx);
-  sch[x].bind(bx, tvm::thread_axis(Range(), "blockIdx.x"));
-  sch[x].bind(tx, tvm::thread_axis(Range(), "threadIdx.x"));
-  return sch;
-}
-
-/*!
-* \brief Schedule an extern op followed by injective operations.
-* For example, cudnn kernel + bias add + relu
-*
-* \param target The target to generate a schedule for.
-* \param outs The output tensors.
-*
-* \return A schedule for the op.
-*/
-inline Schedule schedule_extern(const Target& target, Array<Tensor> outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  tvm::schedule::AutoInlineInjective(s);
-  for (auto out : outs) {
-    if (out->op->derived_from<tvm::ExternOpNode>()) {
-      continue;
-    }
-    ScheduleOutputForExtern(target, out->op, s);
-  }
-
-  return s;
-}
-
-}  // namespace cuda
-}  // namespace topi
-#endif  // TOPI_CUDA_EXTERN_H_
diff --git a/topi/include/topi/generic/extern.h b/topi/include/topi/generic/extern.h
index 6c51d7c7b2d5..57daedd5ed85 100644
--- a/topi/include/topi/generic/extern.h
+++ b/topi/include/topi/generic/extern.h
@@ -28,6 +28,7 @@
 #include "topi/detail/fuse.h"
 #include "tvm/operation.h"
 #include "tvm/build_module.h"
+#include "injective.h"
 
 namespace topi {
 using namespace tvm;
@@ -47,6 +48,16 @@ inline Schedule schedule_extern(const Target& target, Array<Tensor> outs) {
     out_ops.push_back(t->op);
   }
   auto s = create_schedule(out_ops);
+
+  tvm::schedule::AutoInlineInjective(s);
+  for (auto out : outs) {
+    if (out->op->derived_from<tvm::ExternOpNode>()) {
+      continue;
+    }
+    Array<Tensor> new_outs = { out };
+    tvm::GenericFunc::Get("schedule_injective")(new_outs);
+  }
+
   return s;
 }
 
diff --git a/topi/include/topi/x86/extern.h b/topi/include/topi/x86/extern.h
deleted file mode 100644
index f925c72dd428..000000000000
--- a/topi/include/topi/x86/extern.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file x86/extern.h
- * \brief x86 schedule for extern followed by injective operations
- */
-#ifndef TOPI_X86_EXTERN_H_
-#define TOPI_X86_EXTERN_H_
-
-#include "topi/tags.h"
-#include "topi/detail/fuse.h"
-#include "tvm/operation.h"
-#include "tvm/build_module.h"
-
-namespace topi {
-using namespace tvm;
-
-namespace x86 {
-/*!
- * \brief Schedule a given operation representing one of the outputs of an
- * external function which is followed by injective operations.
- *
- * \param target The target to generate a schedule for.
- * \param op The operation representing the output followed by injective operations.
- * \param sch The schedule to apply this scheduling to
- *
- * \return The schedule given by sch
- */
-inline Schedule ScheduleOutputForExtern(Target target, Operation op, Schedule sch) {
-  auto x = op.output(0);
-  auto axis = sch[x]->op.as<ComputeOpNode>()->axis;
-  if (axis.size() == 4) {
-    auto n = axis[0];
-    auto c = axis[1];
-    auto fused = detail::Fuse(sch[x], { n, c });  // for nhwc layout, fuse n and h
-    sch[x].parallel(fused);
-  } else {
-    sch[x].parallel(axis[0]);
-  }
-  return sch;
-}
-
-/*!
-* \brief Schedule an extern op followed by injective operations.
-* For example, cudnn kernel + bias add + relu
-*
-* \param target The target to generate a schedule for.
-* \param outs The output tensors.
-*
-* \return A schedule for the op.
-*/
-inline Schedule schedule_extern(const Target& target, Array<Tensor> outs) {
-  Array<Operation> out_ops;
-  for (auto t : outs) {
-    out_ops.push_back(t->op);
-  }
-  auto s = create_schedule(out_ops);
-
-  tvm::schedule::AutoInlineInjective(s);
-  for (auto out : outs) {
-    if (out->op->derived_from<tvm::ExternOpNode>()) {
-      continue;
-    }
-    ScheduleOutputForExtern(target, out->op, s);
-  }
-
-  return s;
-}
-
-}  // namespace x86
-}  // namespace topi
-#endif  // TOPI_X86_EXTERN_H_
diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py
index 5983aeccfb21..a0601147d5e8 100644
--- a/topi/python/topi/generic/extern.py
+++ b/topi/python/topi/generic/extern.py
@@ -19,7 +19,7 @@
 from __future__ import absolute_import as _abs
 
 import tvm
-from .injective import schedule_injective
+from .. import cpp
 
 @tvm.target.generic_func
 def schedule_extern(outs):
@@ -36,12 +36,5 @@ def schedule_extern(outs):
     sch: Schedule
         The computation schedule for the op.
     """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    s = tvm.create_schedule([x.op for x in outs])
-
-    tvm.schedule.AutoInlineInjective(s)
-    for out in outs:
-        if isinstance(out.op, tvm.tensor.ExternOp):
-            continue
-        schedule_injective(out)
-    return s
+    target = tvm.target.current_target()
+    return cpp.generic.schedule_extern(target, outs)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 91d18d212428..c8171e04cdc8 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -56,7 +56,6 @@
 #include <topi/generic/injective.h>
 
 #include <topi/cuda/dense.h>
-#include <topi/cuda/extern.h>
 #include <topi/cuda/injective.h>
 #include <topi/cuda/normalization.h>
 #include <topi/cuda/pooling.h>
@@ -65,7 +64,6 @@
 #include <topi/x86/bnn.h>
 #include <topi/x86/default.h>
-#include <topi/x86/extern.h>
 #include <topi/x86/injective.h>
 #include <topi/x86/pooling.h>
 
@@ -607,11 +605,6 @@ TVM_REGISTER_GLOBAL("topi.x86.default_schedule")
     }
   });
 
-TVM_REGISTER_GLOBAL("topi.x86.schedule_extern")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::x86::schedule_extern(args[0], args[1]);
-  });
-
 TVM_REGISTER_GLOBAL("topi.x86.schedule_injective")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = topi::x86::schedule_injective(args[0], args[1]);
@@ -649,11 +642,6 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_dense")
   *rv = topi::cuda::schedule_dense(args[0], args[1]);
   });
 
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_extern")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::cuda::schedule_extern(args[0], args[1]);
-  });
-
 TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = topi::cuda::schedule_injective(args[0], args[1]);
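
Patch 05 moves the whole extern schedule into C++: the generic implementation now inlines injective producers and dispatches each non-extern output through the "schedule_injective" GenericFunc. A sketch of the call path from Python, with a toy extern op standing in for a real cudnn/cblas kernel; the packed-function name 'tvm.contrib.my_copy' is made up, and the API is the 0.6-era one:

    import tvm
    from topi import cpp

    A = tvm.placeholder((128, 128), name='A')
    # Toy extern op; 'tvm.contrib.my_copy' is a hypothetical packed func.
    C = tvm.extern((128, 128), [A],
                   lambda ins, outs: tvm.call_packed(
                       'tvm.contrib.my_copy', ins[0], outs[0]),
                   name='C')
    # Injective tail after the extern op.
    D = tvm.compute((128, 128), lambda i, j: C[i, j] + 1, name='D')

    target = tvm.target.create('llvm')
    with target:
        # topi::generic::schedule_extern skips C (an ExternOpNode) and
        # schedules D via the "schedule_injective" GenericFunc.
        s = cpp.generic.schedule_extern(target, [D])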
From 9df90101cbd11b59c6101f6e053118121ca5898c Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Mon, 23 Sep 2019 13:56:27 -0700
Subject: [PATCH 06/18] Introduce new generic function schedule_injective_from_existing

---
 topi/include/topi/cuda/injective.h         | 29 ++++++++-------
 topi/include/topi/generic/extern.h         |  3 +-
 topi/include/topi/generic/injective.h      | 15 +++++++-
 topi/include/topi/x86/injective.h          | 32 ++++++++++++-----
 topi/python/topi/arm_cpu/injective.py      | 35 +++++++++++++-----
 topi/python/topi/cuda/conv2d_int8.py       |  6 ++--
 topi/python/topi/cuda/group_conv2d_nchw.py |  6 ++--
 topi/python/topi/cuda/injective.py         | 41 ++++++++++++++--------
 topi/python/topi/cuda/reduction.py         |  6 ++--
 topi/python/topi/cuda/softmax.py           |  4 +--
 topi/python/topi/cuda/sort.py              |  4 +--
 topi/python/topi/cuda/vision.py            |  4 +--
 topi/python/topi/generic/injective.py      | 21 ++++++++++-
 topi/python/topi/hls/injective.py          | 25 +++++++++++--
 topi/python/topi/opengl/injective.py       | 23 +++++++++---
 topi/python/topi/x86/injective.py          | 35 +++++++++++++-----
 topi/src/topi.cc                           | 39 ++++++++++++++++++++
 17 files changed, 249 insertions(+), 79 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index e629ae1c3503..663bc1fcbeb2 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -33,21 +33,24 @@ namespace topi {
 using namespace tvm;
 
 namespace cuda {
+
 /*!
-* \brief Schedule a given injective operation.
-*
-* \param target The target to generate a schedule for.
-* \param op The operation representing the injective operation.
-* \param s The schedule to apply this scheduling to
-*/
-inline void ScheduleInjectiveOp(const Target &target, Operation op, Schedule s) {
-  auto x = op.output(0);
-  auto fused = detail::Fuse(s[x], s[x]->op.as<ComputeOpNode>()->axis);
+ * \brief Updates an existing schedule for the given injective ops.
+ *
+ * \param sch The schedule to update.
+ * \param out The tensor representing the injective op.
+ *
+ * \return The updated schedule.
+ */
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
+  auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
+  auto target = Target::Current(false);
   auto num_thread = target->max_num_threads;
   IterVar bx, tx;
-  s[x].split(fused, num_thread, &bx, &tx);
-  s[x].bind(bx, thread_axis(Range(), "blockIdx.x"));
-  s[x].bind(tx, thread_axis(Range(), "threadIdx.x"));
+  sch[out].split(fused, num_thread, &bx, &tx);
+  sch[out].bind(bx, thread_axis(Range(), "blockIdx.x"));
+  sch[out].bind(tx, thread_axis(Range(), "threadIdx.x"));
+  return sch;
 }
 
 /*!
@@ -66,7 +69,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    ScheduleInjectiveOp(target, out->op, s);
+    schedule_injective_from_existing(s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/generic/extern.h b/topi/include/topi/generic/extern.h
index 57daedd5ed85..5c0c392f768d 100644
--- a/topi/include/topi/generic/extern.h
+++ b/topi/include/topi/generic/extern.h
@@ -54,8 +54,7 @@ inline Schedule schedule_extern(const Target& target, Array<Tensor> outs) {
     if (out->op->derived_from<tvm::ExternOpNode>()) {
       continue;
     }
-    Array<Tensor> new_outs = { out };
-    tvm::GenericFunc::Get("schedule_injective")(new_outs);
+    tvm::GenericFunc::Get("schedule_injective_from_existing")(s, out);
   }
 
   return s;
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index f26651c2ddb7..fa7df4ccfd46 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -34,6 +34,19 @@ using namespace tvm;
 
 namespace generic {
 
+/*!
+ * \brief Updates an existing schedule for the given injective ops.
+ *
+ * \param sch The schedule to update.
+ * \param out The tensor representing the injective op.
+ *
+ * \return The updated schedule.
+ */
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
+  detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
+  return sch;
+}
+
 /*!
  * \brief Create a generic schedule for the given injective ops.
 *
@@ -50,7 +63,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  detail::Fuse(s[x], s[x]->op.as<ComputeOpNode>()->axis);
+  schedule_injective_from_existing(s, x);
   return s;
 }
 
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index adbe382495b8..7cb79ae322c2 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -33,6 +33,28 @@ namespace topi {
 using namespace tvm;
 
 namespace x86 {
+
+/*!
+ * \brief Updates an existing schedule for the given injective ops.
+ *
+ * \param sch The schedule to update.
+ * \param out The tensor representing the injective op.
+ *
+ * \return The updated schedule.
+ */
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
+  auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
+  if (axis.size() == 4) {
+    auto n = axis[0];
+    auto c = axis[1];
+    auto fused = detail::Fuse(sch[out], { n, c });  // for nhwc layout, fuse n and h
+    sch[out].parallel(fused);
+  } else {
+    sch[out].parallel(axis[0]);
+  }
+  return sch;
+}
+
 /*!
  * \brief Create an x86 schedule for the given injective ops.
 *
@@ -50,15 +72,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
 
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  auto axis = s[x]->op.as<ComputeOpNode>()->axis;
-  if (axis.size() == 4) {
-    auto n = axis[0];
-    auto c = axis[1];
-    auto fused = detail::Fuse(s[x], { n, c });  // for nhwc layout, fuse n and h
-    s[x].parallel(fused);
-  } else {
-    s[x].parallel(axis[0]);
-  }
+  schedule_injective_from_existing(s, x);
   return s;
 }
 
diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py
index 028558f69e91..cd14e3a0f058 100644
--- a/topi/python/topi/arm_cpu/injective.py
+++ b/topi/python/topi/arm_cpu/injective.py
@@ -19,6 +19,32 @@
 import tvm
 from .. import generic
 
+@generic.schedule_injective_from_existing.register(["arm_cpu"])
+def schedule_injective_from_existing(sch, out):
+    """Schedule for injective op from existing schedule.
+
+    Parameters
+    ----------
+    sch: Schedule
+        The schedule to update.
+    out: Tensor
+        The tensor representing the injective op.
+
+    Returns
+    -------
+    sch: Schedule
+        The updated schedule.
+    """
+    if len(sch[out].op.axis) >= 4:
+        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1], sch[out].op.axis[2])
+        sch[out].parallel(fused)
+    elif len(sch[out].op.axis) >= 3:
+        fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1])
+        sch[out].parallel(fused)
+    elif len(sch[x].op.axis) >= 2:
+        sch[out].parallel(sch[out].op.axis[0])
+    return sch
+
 @generic.schedule_injective.register(["arm_cpu"])
 def schedule_injective(outs):
     """ARM CPU schedule for injective op.
@@ -42,14 +68,7 @@ def schedule_injective(outs):
         (io, ii) = s[x].split(list(s[x].op.axis)[-1], 8)
         s[x].vectorize(ii)
     tvm.schedule.AutoInlineInjective(s)
-    if len(s[x].op.axis) >= 4:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 3:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 2:
-        s[x].parallel(s[x].op.axis[0])
+    schedule_injective_from_existing(s, x)
     return s
 
 @generic.schedule_concatenate.register(["arm_cpu"])
diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
index 25d49451810d..580cf96b53e8 100644
--- a/topi/python/topi/cuda/conv2d_int8.py
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -19,7 +19,7 @@
 
 import tvm
 from tvm import autotvm
-from .injective import _schedule_injective
+from .injective import schedule_injective_from_existing
 from .tensor_intrin import dp4a
 from ..nn.pad import pad
 from ..nn.util import get_pad_tuple
@@ -172,8 +172,8 @@ def schedule_conv2d_NCHWc_int8(cfg, s, output):
     if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
             packed_kernel.name == 'packed_kernel':
         # data and kernel are not pre-computed, schedule layout transform here
-        _schedule_injective(packed_data.op, s)
-        _schedule_injective(packed_kernel.op, s)
+        schedule_injective_from_existing(s, packed_data)
+        schedule_injective_from_existing(s, packed_kernel)
 
     if pad_data != packed_data:
         s[pad_data].compute_inline()
diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py
index cd8c823ae8ae..f4bb73470651 100644
--- a/topi/python/topi/cuda/group_conv2d_nchw.py
+++ b/topi/python/topi/cuda/group_conv2d_nchw.py
@@ -19,7 +19,7 @@
 
 import tvm
 from tvm import autotvm
-from .injective import _schedule_injective
+from .injective import schedule_injective_from_existing
 from .tensor_intrin import dp4a
 from ..nn.pad import pad
 from ..nn.util import get_pad_tuple
@@ -201,8 +201,8 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output):
     if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
             packed_kernel.name == 'packed_kernel':
         # data and kernel are not pre-computed, schedule layout transform here
-        _schedule_injective(packed_data.op, s)
-        _schedule_injective(packed_kernel.op, s)
+        schedule_injective_from_existing(s, packed_data)
+        schedule_injective_from_existing(s, packed_kernel)
 
     if pad_data != packed_data:
         s[pad_data].compute_inline()
diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py
index 8a16c5ca2dd4..a6ec85377a13 100644
--- a/topi/python/topi/cuda/injective.py
+++ b/topi/python/topi/cuda/injective.py
@@ -19,33 +19,46 @@
 import tvm
 from .. import generic, util
 
-def _schedule_injective(op, sch):
-    x = op.output(0)
-    fused = sch[x].fuse(*sch[x].op.axis)
+@generic.schedule_injective_from_existing.register(["cuda", "gpu"])
+def schedule_injective_from_existing(sch, out):
+    """Schedule for injective op from existing schedule.
+
+    Parameters
+    ----------
+    sch: Schedule
+        The schedule to update.
+    out: Tensor
+        The tensor representing the injective op.
+
+    Returns
+    -------
+    sch: Schedule
+        The updated schedule.
+ """ + fused = sch[out].fuse(*sch[out].op.axis) num_thread = tvm.target.current_target(allow_none=False).max_num_threads max_block = 256 try: - const_size = util.get_const_int(util.prod(x.shape)) + const_size = util.get_const_int(util.prod(out.shape)) max_block = 256 need_block_split = const_size > max_block * num_thread except ValueError: need_block_split = False if need_block_split: - xo, xi = sch[x].split(fused, factor=num_thread * max_block) - bx, tx = sch[x].split(xi, factor=num_thread) - sch[x].reorder(bx, tx, xo) - sch[x].bind(bx, tvm.thread_axis("blockIdx.x")) - sch[x].bind(tx, tvm.thread_axis("threadIdx.x")) + xo, xi = sch[out].split(fused, factor=num_thread * max_block) + bx, tx = sch[out].split(xi, factor=num_thread) + sch[out].reorder(bx, tx, xo) + sch[out].bind(bx, tvm.thread_axis("blockIdx.x")) + sch[out].bind(tx, tvm.thread_axis("threadIdx.x")) else: - bx, tx = sch[x].split(fused, factor=num_thread) - sch[x].bind(tx, tvm.thread_axis("threadIdx.x")) - sch[x].bind(bx, tvm.thread_axis("blockIdx.x")) + bx, tx = sch[out].split(fused, factor=num_thread) + sch[out].bind(tx, tvm.thread_axis("threadIdx.x")) + sch[out].bind(bx, tvm.thread_axis("blockIdx.x")) return sch - @generic.schedule_injective.register(["cuda", "gpu"]) def schedule_injective(outs): """Schedule for injective op. @@ -66,7 +79,7 @@ def schedule_injective(outs): tvm.schedule.AutoInlineInjective(s) for out in outs: - _schedule_injective(out.op, s) + schedule_injective_from_existing(s, out) return s schedule_elemwise = schedule_injective diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 25885315179c..a7a56b90fb1f 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -20,7 +20,7 @@ import tvm from .. import tag from .. import generic -from .injective import _schedule_injective +from .injective import schedule_injective_from_existing def _schedule_reduce(op, sch, is_idx_reduce=False): if is_idx_reduce: @@ -30,7 +30,7 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): data_out = op.output(0) if not sch[data_out].op.reduce_axis: - return _schedule_injective(op, sch) + return schedule_injective_from_existing(sch, op.output(0)) if len(sch[data_out].op.axis) > 0: all_reduce = False @@ -126,7 +126,7 @@ def traverse_after_reduce(operator): """Internal travserse function""" if tag.is_broadcast(operator.tag): if operator not in scheduled_ops: - _schedule_injective(operator, sch) + schedule_injective_from_existing(sch, operator.output(0)) for tensor in operator.input_tensors: traverse_after_reduce(tensor.op) elif operator.tag == 'comm_reduce': diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index 09b6ef8030ec..9fdac50a79d1 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -18,7 +18,7 @@ """Schedule for softmax operator""" import tvm from .. 
-from .injective import _schedule_injective
+from .injective import schedule_injective_from_existing
 
 @generic.schedule_softmax.register(["cuda", "gpu"])
 def schedule_softmax(outs):
@@ -58,7 +58,7 @@ def schedule_softmax(outs):
             ops.append(exp.op)
 
         for op in ops:
-            s = _schedule_injective(op, s)
+            s = schedule_injective_from_existing(s, op.output(0))
     else:
         num_thread = 64
         block_x = tvm.thread_axis("blockIdx.x")
diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py
index 1d9148f46278..c45465e31624 100644
--- a/topi/python/topi/cuda/sort.py
+++ b/topi/python/topi/cuda/sort.py
@@ -42,10 +42,10 @@ def _schedule_sort(outs):
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
-    from .injective import _schedule_injective
+    from .injective import schedule_injective_from_existing
     def traverse(op):
         if tag.is_injective(op.tag):
-            _schedule_injective(op, s)
+            schedule_injective_from_existing(s, op.output(0))
         for tensor in op.input_tensors:
             if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                 traverse(tensor.op)
diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py
index 968e554ac81d..3a90402707ad 100644
--- a/topi/python/topi/cuda/vision.py
+++ b/topi/python/topi/cuda/vision.py
@@ -28,10 +28,10 @@ def _default_schedule(outs):
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
-    from .injective import _schedule_injective
+    from .injective import schedule_injective_from_existing
    def traverse(op):
         if tag.is_broadcast(op.tag) or op.tag in ['bbox_score', 'sorted_bbox']:
-            _schedule_injective(op, s)
+            schedule_injective_from_existing(s, op.output(0))
         for tensor in op.input_tensors:
             if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                 traverse(tensor.op)
diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py
index ec1732f394f2..178363dc0d4f 100644
--- a/topi/python/topi/generic/injective.py
+++ b/topi/python/topi/generic/injective.py
@@ -20,6 +20,25 @@
 
 import tvm
 
+@tvm.target.override_native_generic_func("schedule_injective_from_existing")
+def schedule_injective_from_existing(sch, out):
+    """Schedule for injective op from existing schedule.
+
+    Parameters
+    ----------
+    sch: Schedule
+        The schedule to update.
+    out: Tensor
+        The tensor representing the injective op.
+
+    Returns
+    -------
+    sch: Schedule
+        The updated schedule.
+    """
+    sch[out].fuse(s[out].op.axis)
+    return sch
+
 @tvm.target.override_native_generic_func("schedule_injective")
 def schedule_injective(outs):
     """Schedule for injective op.
@@ -42,7 +61,7 @@ def schedule_injective(outs):
     x = outs[0]
     s = tvm.create_schedule([x.op for x in outs])
     tvm.schedule.AutoInlineInjective(s)
-    s[x].fuse(s[x].op.axis)
+    schedule_injective_from_existing(s, x)
     return s
 
 @tvm.target.generic_func
diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py
index 5a43306f361b..de584287a90e 100644
--- a/topi/python/topi/hls/injective.py
+++ b/topi/python/topi/hls/injective.py
@@ -19,6 +19,27 @@
 import tvm
 from .. import generic
 
+@generic.schedule_injective_from_existing.register(["hls"])
+def schedule_injective_from_existing(sch, out):
+    """Schedule for injective op from existing schedule.
+
+    Parameters
+    ----------
+    sch: Schedule
+        The schedule to update.
+    out: Tensor
+        The tensor representing the injective op.
+
+    Returns
+    -------
+    sch: Schedule
+        The updated schedule.
+ """ + fused = sch[out].fuse(*sch[out].op.axis) + px, x = sch[out].split(fused, nparts=1) + sch[out].bind(px, tvm.thread_axis("pipeline")) + return sch + @generic.schedule_injective.register(["hls"]) def schedule_injective(outs): """Schedule for injective op. @@ -38,9 +59,7 @@ def schedule_injective(outs): s = tvm.create_schedule([x.op for x in outs]) tvm.schedule.AutoInlineInjective(s) for out in outs: - fused = s[out].fuse(*s[out].op.axis) - px, x = s[out].split(fused, nparts=1) - s[out].bind(px, tvm.thread_axis("pipeline")) + schedule_injective_from_existing(s, out) return s schedule_elemwise = schedule_injective diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py index 1f3f4942ac95..d3ebc943b962 100644 --- a/topi/python/topi/opengl/injective.py +++ b/topi/python/topi/opengl/injective.py @@ -19,11 +19,24 @@ import tvm from .. import generic -def _schedule_injective(op, sch): - x = op.output(0) - sch[x].opengl() - return sch +@generic.schedule_injective_from_existing.register(["opengl"]) +def schedule_injective_from_existing(sch, out): + """Schedule for injective op from existing schedule. + + Parameters + ---------- + sch: Schedule + The schedule to update. + out: Tensor + The tensor representing the injective op. + Returns + ------- + sch: Schedule + The updated schedule. + """ + sch[out].opengl() + return sch @generic.schedule_injective.register(["opengl"]) def schedule_injective(outs): @@ -45,7 +58,7 @@ def schedule_injective(outs): tvm.schedule.AutoInlineInjective(s) for out in outs: - _schedule_injective(out.op, s) + schedule_injective_from_existing(s, out) return s schedule_elemwise = schedule_injective diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index 37080e03d8e4..5bcb17921b79 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -20,6 +20,32 @@ import tvm from .. import generic +@generic.schedule_injective_from_existing.register(["cpu"]) +def schedule_injective_from_existing(sch, out): + """Schedule for injective op from existing schedule. + + Parameters + ---------- + sch: Schedule + The schedule to update. + out: Tensor + The tensor representing the injective op. + + Returns + ------- + sch: Schedule + The updated schedule. + """ + if len(sch[out].op.axis) >= 5: + fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1], sch[out].op.axis[2]) + sch[out].parallel(fused) + elif len(sch[out].op.axis) >= 3: + fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1]) + sch[out].parallel(fused) + elif len(sch[out].op.axis) >= 1: + sch[out].parallel(sch[out].op.axis[0]) + return sch + @generic.schedule_injective.register(["cpu"]) def schedule_injective(outs): """X86 schedule for injective op. 
@@ -39,14 +65,7 @@ def schedule_injective(outs):
     x = outs[0]
     s = tvm.create_schedule([x.op for x in outs])
     tvm.schedule.AutoInlineInjective(s)
-    if len(s[x].op.axis) >= 5:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 3:
-        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
-        s[x].parallel(fused)
-    elif len(s[x].op.axis) >= 1:
-        s[x].parallel(s[x].op.axis[0])
+    schedule_injective_from_existing(s, x)
     return s
 
 @generic.schedule_concatenate.register(["cpu"])
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index c8171e04cdc8..7114f4d878a8 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -585,6 +585,11 @@ TVM_REGISTER_GLOBAL("topi.generic.schedule_injective")
   *rv = topi::generic::schedule_injective(args[0], args[1]);
   });
 
+TVM_REGISTER_GLOBAL("topi.generic.schedule_injective_from_existing")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::generic::schedule_injective_from_existing(args[0], args[1]);
+  });
+
 /* x86 schedules */
 TVM_REGISTER_GLOBAL("topi.x86.schedule_binarize_pack")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -610,6 +615,11 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective")
   *rv = topi::x86::schedule_injective(args[0], args[1]);
   });
 
+TVM_REGISTER_GLOBAL("topi.x86.schedule_injective_from_existing")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::x86::schedule_injective_from_existing(args[0], args[1]);
+  });
+
 /* ROCm schedules */
 TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -647,6 +657,11 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective")
   *rv = topi::cuda::schedule_injective(args[0], args[1]);
   });
 
+TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective_from_existing")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1]);
+  });
+
 TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = topi::cuda::schedule_pool(args[0], args[1]);
@@ -746,6 +761,30 @@ TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense)
 .set_default(WrapSchedule(topi::generic::default_schedule))
 .register_func({ "cpu" }, WrapSchedule(topi::x86::schedule_binary_dense));
 
+/*! \brief Builder function for instantiating schedules from existing schedules. */
+using FTVMScheduleFromExistingBuilder = std::function<
+  tvm::Schedule(tvm::Schedule sch, const tvm::Tensor& out)>;
+
+/*!
+ * \brief Helper function for registering generic functions matching the
+ * FTVMScheduleFromExistingBuilder signature. The schedule builder function is wrapped
+ * with a PackedFunc suitable for passing to a tvm::GenericFunc.
+ *
+ * \param builder The schedule builder to wrap.
+ *
+ * \return The wrapped schedule builder
+ */
+inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder builder) {
+  return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
+    *ret = builder(args[0], args[1]);
+  });
+}
+
+TVM_REGISTER_GENERIC_FUNC(schedule_injective_from_existing)
+.set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing))
+.register_func({ "cpu" }, WrapScheduleFromExisting(topi::x86::schedule_injective_from_existing))
+.register_func({ "cuda", "gpu" }, WrapScheduleFromExisting(topi::cuda::schedule_injective_from_existing));
+
 /*! \brief Builder function for instantiating dense ops. */
 using FTVMDenseOpBuilder = std::function<tvm::Tensor(const Target& target,
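
The new hook introduced in patch 06 is a plain generic function, so a backend can override it exactly the way the in-tree targets do above. A minimal sketch; the "mytarget" key is illustrative, not a real registered target:

    import tvm
    from topi import generic

    @generic.schedule_injective_from_existing.register(["mytarget"])
    def schedule_injective_from_existing(sch, out):
        """Update the existing schedule sch for the injective output out."""
        fused = sch[out].fuse(*sch[out].op.axis)
        sch[out].parallel(fused)
        return sch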
From: Jon Soifer
Date: Mon, 23 Sep 2019 15:47:18 -0700
Subject: [PATCH 07/18] Fix

---
 topi/include/topi/cuda/injective.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 663bc1fcbeb2..5f75bd71e581 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -44,7 +44,7 @@ namespace cuda {
  */
 inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
   auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
-  auto target = Target::Current(false);
+  auto target = Target::Current();
   auto num_thread = target->max_num_threads;
   IterVar bx, tx;
   sch[out].split(fused, num_thread, &bx, &tx);

From 09d5f0c306f699945727021e02a3687970096eb7 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Mon, 23 Sep 2019 16:53:28 -0700
Subject: [PATCH 08/18] Fix

---
 topi/include/topi/cuda/injective.h | 21 +--------------------
 topi/include/topi/x86/injective.h  | 23 +----------------------
 topi/src/topi.cc                   | 14 +-------------
 3 files changed, 3 insertions(+), 55 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 5f75bd71e581..584779525e26 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -34,25 +34,6 @@ using namespace tvm;
 
 namespace cuda {
 
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
-  auto target = Target::Current();
-  auto num_thread = target->max_num_threads;
-  IterVar bx, tx;
-  sch[out].split(fused, num_thread, &bx, &tx);
-  sch[out].bind(bx, thread_axis(Range(), "blockIdx.x"));
-  sch[out].bind(tx, thread_axis(Range(), "threadIdx.x"));
-  return sch;
-}
-
 /*!
  * \brief Create a CUDA schedule for the given output tensors.
  *
@@ -69,7 +50,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    schedule_injective_from_existing(s, out);
+    tvm::GenericFunc::Get("schedule_injective_from_existing")(s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 7cb79ae322c2..a762714966cc 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -34,27 +34,6 @@ using namespace tvm;
 
 namespace x86 {
 
-/*!
- * \brief Updates an existing schedule for the given injective ops.
- *
- * \param sch The schedule to update.
- * \param out The tensor representing the injective op.
- *
- * \return The updated schedule.
- */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
-  auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
-  if (axis.size() == 4) {
-    auto n = axis[0];
-    auto c = axis[1];
-    auto fused = detail::Fuse(sch[out], { n, c });  // for nhwc layout, fuse n and h
-    sch[out].parallel(fused);
-  } else {
-    sch[out].parallel(axis[0]);
-  }
-  return sch;
-}
-
 /*!
  * \brief Create an x86 schedule for the given injective ops.
 *
@@ -72,7 +51,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
 
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  schedule_injective_from_existing(s, x);
+  tvm::GenericFunc::Get("schedule_injective_from_existing")(s, x);
   return s;
 }
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 7114f4d878a8..d5fb969ad780 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -615,11 +615,6 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective")
   *rv = topi::x86::schedule_injective(args[0], args[1]);
   });
 
-TVM_REGISTER_GLOBAL("topi.x86.schedule_injective_from_existing")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::x86::schedule_injective_from_existing(args[0], args[1]);
-  });
-
 /* ROCm schedules */
 TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -657,11 +652,6 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective")
   *rv = topi::cuda::schedule_injective(args[0], args[1]);
   });
 
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective_from_existing")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1]);
-  });
-
 TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = topi::cuda::schedule_pool(args[0], args[1]);
@@ -781,9 +771,7 @@ inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder build
 }
 
 TVM_REGISTER_GENERIC_FUNC(schedule_injective_from_existing)
-.set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing))
-.register_func({ "cpu" }, WrapScheduleFromExisting(topi::x86::schedule_injective_from_existing))
-.register_func({ "cuda", "gpu" }, WrapScheduleFromExisting(topi::cuda::schedule_injective_from_existing));
+.set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing));
 
 /*! \brief Builder function for instantiating dense ops. */
 using FTVMDenseOpBuilder = std::function<tvm::Tensor(const Target& target,

From: Jon Soifer
Date: Mon, 23 Sep 2019 17:19:39 -0700
Subject: [PATCH 09/18] Add back to C++

---
 topi/include/topi/cuda/injective.h    | 20 +++++++++++++++++++-
 topi/include/topi/generic/injective.h |  5 +++--
 topi/include/topi/x86/injective.h     | 23 ++++++++++++++++++++++-
 topi/src/topi.cc                      | 20 ++++++++++++++++----
 4 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 584779525e26..885acdef230d 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -34,6 +34,24 @@ using namespace tvm;
 
 namespace cuda {
 
+/*!
+ * \brief Updates an existing schedule for the given injective ops.
+ *
+ * \param sch The schedule to update.
+ * \param out The tensor representing the injective op.
+ *
+ * \return The updated schedule.
+ */
+inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
+  auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
+  auto num_thread = target->max_num_threads;
+  IterVar bx, tx;
+  sch[out].split(fused, num_thread, &bx, &tx);
+  sch[out].bind(bx, thread_axis(Range(), "blockIdx.x"));
+  sch[out].bind(tx, thread_axis(Range(), "threadIdx.x"));
+  return sch;
+}
+
 /*!
  * \brief Create a CUDA schedule for the given output tensors.
 *
@@ -50,7 +68,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    tvm::GenericFunc::Get("schedule_injective_from_existing")(s, out);
+    tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index fa7df4ccfd46..2cf48b54c294 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -37,12 +37,13 @@ namespace generic {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
+ * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
+inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
   detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
   return sch;
 }
@@ -63,7 +64,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  schedule_injective_from_existing(s, x);
+  tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, x);
   return s;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index a762714966cc..614ca8b9ed29 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -34,6 +34,27 @@ using namespace tvm;
 
 namespace x86 {
 
+/*!
+ * \brief Updates an existing schedule for the given injective ops.
+ *
+ * \param sch The schedule to update.
+ * \param out The tensor representing the injective op.
+ *
+ * \return The updated schedule.
+ */
+inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
+  auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
+  if (axis.size() == 4) {
+    auto n = axis[0];
+    auto c = axis[1];
+    auto fused = detail::Fuse(sch[out], { n, c });  // for nhwc layout, fuse n and h
+    sch[out].parallel(fused);
+  } else {
+    sch[out].parallel(axis[0]);
+  }
+  return sch;
+}
+
 /*!
  * \brief Create an x86 schedule for the given injective ops.
 *
@@ -51,7 +72,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
 
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  tvm::GenericFunc::Get("schedule_injective_from_existing")(s, x);
+  tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, x);
   return s;
 }
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index d5fb969ad780..b7bea8dd1717 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -587,7 +587,7 @@ TVM_REGISTER_GLOBAL("topi.generic.schedule_injective")
 
 TVM_REGISTER_GLOBAL("topi.generic.schedule_injective_from_existing")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::generic::schedule_injective_from_existing(args[0], args[1]);
+  *rv = topi::generic::schedule_injective_from_existing(args[0], args[1], args[2]);
   });
 
 /* x86 schedules */
@@ -615,6 +615,11 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective")
   *rv = topi::x86::schedule_injective(args[0], args[1]);
   });
 
+TVM_REGISTER_GLOBAL("topi.x86.schedule_injective_from_existing")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::x86::schedule_injective_from_existing(args[0], args[1], args[2]);
+  });
+
 /* ROCm schedules */
 TVM_REGISTER_GLOBAL("topi.rocm.dense_cuda")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -652,6 +657,11 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective")
   *rv = topi::cuda::schedule_injective(args[0], args[1]);
   });
 
+TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective_from_existing")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1], args[2]);
+  });
+
 TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = topi::cuda::schedule_pool(args[0], args[1]);
@@ -763,7 +763,7 @@ TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense)
 
 /*! \brief Builder function for instantiating schedules from existing schedules. */
 using FTVMScheduleFromExistingBuilder = std::function<
-  tvm::Schedule(tvm::Schedule sch, const tvm::Tensor& out)>;
+  tvm::Schedule(const Target& target, tvm::Schedule sch, const tvm::Tensor& out)>;
 
 /*!
@@ -766,12 +776,14 @@ using FTVMScheduleFromExistingBuilder = std::function<
  */
 inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder builder) {
   return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
-    *ret = builder(args[0], args[1]);
+    *ret = builder(args[0], args[1], args[2]);
   });
 }
 
 TVM_REGISTER_GENERIC_FUNC(schedule_injective_from_existing)
-.set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing));
+.set_default(WrapScheduleFromExisting(topi::generic::schedule_injective_from_existing))
+.register_func({ "cpu" }, WrapScheduleFromExisting(topi::x86::schedule_injective_from_existing))
+.register_func({ "cuda", "gpu" }, WrapScheduleFromExisting(topi::cuda::schedule_injective_from_existing));
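
After patch 09 the C++ implementations take the target explicitly and are each exposed over the FFI, so the three-argument form can be driven straight from Python. A sketch against the registrations above (patch 13 later drops the target argument again):

    import tvm
    from topi import cpp

    A = tvm.placeholder((64, 64), name='A')
    B = tvm.compute((64, 64), lambda i, j: A[i, j] + 1, name='B')

    target = tvm.target.create('llvm')
    s = tvm.create_schedule([B.op])
    # topi.x86.schedule_injective_from_existing(target, sch, out)
    s = cpp.x86.schedule_injective_from_existing(target, s, B)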
From: Jon Soifer
Date: Mon, 23 Sep 2019 17:22:31 -0700
Subject: [PATCH 10/18] Fix style

---
 topi/include/topi/cuda/injective.h    | 5 ++++-
 topi/include/topi/generic/injective.h | 4 +++-
 topi/include/topi/x86/injective.h     | 5 ++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 885acdef230d..5c697692fca3 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -37,12 +37,15 @@ namespace cuda {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
+ * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
+inline Schedule schedule_injective_from_existing(const Target& target,
+                                                 Schedule sch,
+                                                 const Tensor& out) {
   auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
   auto num_thread = target->max_num_threads;
   IterVar bx, tx;
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index 2cf48b54c294..ccb714f512c3 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -43,7 +43,9 @@ namespace generic {
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
+inline Schedule schedule_injective_from_existing(const Target& target,
+                                                 Schedule sch,
+                                                 const Tensor& out) {
   detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
   return sch;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 614ca8b9ed29..98acb7f75c2a 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -37,12 +37,15 @@ namespace x86 {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
+ * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target, Schedule sch, const Tensor& out) {
+inline Schedule schedule_injective_from_existing(const Target& target,
+                                                 Schedule sch,
+                                                 const Tensor& out) {
   auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
   if (axis.size() == 4) {
     auto n = axis[0];

From 7da8ea25e2158a424b6fc91833385f3b90897a8b Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 24 Sep 2019 09:46:14 -0700
Subject: [PATCH 11/18] Injective schedule calls local schedule_injective_from_existing

---
 topi/include/topi/cuda/injective.h    | 2 +-
 topi/include/topi/generic/injective.h | 2 +-
 topi/include/topi/x86/injective.h     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 5c697692fca3..491bbf4b1c40 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -71,7 +71,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, out);
+    schedule_injective_from_existing(target, s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index ccb714f512c3..6fe46b215c26 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -66,7 +66,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& outs) {
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, x);
+  schedule_injective_from_existing(target, s, x);
   return s;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 98acb7f75c2a..8efe7ff7d86e 100644
From 7da8ea25e2158a424b6fc91833385f3b90897a8b Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 24 Sep 2019 09:46:14 -0700
Subject: [PATCH 11/18] Injective schedule calls local schedule_injective_from_existing

---
 topi/include/topi/cuda/injective.h    | 2 +-
 topi/include/topi/generic/injective.h | 2 +-
 topi/include/topi/x86/injective.h     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 5c697692fca3..491bbf4b1c40 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -71,7 +71,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, out);
+    schedule_injective_from_existing(target, s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index ccb714f512c3..6fe46b215c26 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -66,7 +66,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, x);
+  schedule_injective_from_existing(target, s, x);
   return s;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 98acb7f75c2a..8efe7ff7d86e 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -75,7 +75,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  tvm::GenericFunc::Get("schedule_injective_from_existing")(target, s, x);
+  schedule_injective_from_existing(target, s, x);
   return s;
 }
From decbb5e960488599fd4ed54efef0181fb8f9b677 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Tue, 24 Sep 2019 13:08:19 -0700
Subject: [PATCH 12/18] Fix

---
 topi/python/topi/arm_cpu/injective.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py
index cd14e3a0f058..3727754988f3 100644
--- a/topi/python/topi/arm_cpu/injective.py
+++ b/topi/python/topi/arm_cpu/injective.py
@@ -41,7 +41,7 @@ def schedule_injective_from_existing(sch, out):
     elif len(sch[out].op.axis) >= 3:
         fused = sch[out].fuse(sch[out].op.axis[0], sch[out].op.axis[1])
         sch[out].parallel(fused)
-    elif len(sch[x].op.axis) >= 2:
+    elif len(sch[out].op.axis) >= 2:
         sch[out].parallel(sch[out].op.axis[0])
     return sch
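
The arm_cpu fix above corrects a stale variable name (sch[x] instead of sch[out]) that would raise a NameError whenever a 2-D injective op reached that branch. A small sketch that exercises the repaired path; the shapes are illustrative assumptions:

    import tvm
    from topi.arm_cpu.injective import schedule_injective_from_existing

    A = tvm.placeholder((64, 64), name="A")
    B = tvm.compute((64, 64), lambda i, j: A[i, j] * 2.0, name="B")
    s = tvm.create_schedule(B.op)

    # Two axes, so the previously broken `>= 2` branch runs and
    # parallelizes the outer axis.
    s = schedule_injective_from_existing(s, B)
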
From 847bb5de1f98079b63bdebb2619d286f2178ed08 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Wed, 25 Sep 2019 09:07:33 -0700
Subject: [PATCH 13/18] Remove target arg from schedule_injective_from_existing

---
 topi/include/topi/cuda/injective.h    |  7 +++----
 topi/include/topi/generic/injective.h |  6 ++----
 topi/include/topi/x86/injective.h     |  6 ++----
 topi/src/topi.cc                      | 10 +++++-----
 4 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 491bbf4b1c40..888b41b37be4 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -43,10 +43,9 @@ namespace cuda {
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target,
-                                                 Schedule sch,
-                                                 const Tensor& out) {
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
   auto fused = detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
+  auto target = Target::Current(false);
   auto num_thread = target->max_num_threads;
   IterVar bx, tx;
   sch[out].split(fused, num_thread, &bx, &tx);
@@ -71,7 +70,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   for (auto out : outs) {
-    schedule_injective_from_existing(target, s, out);
+    schedule_injective_from_existing(s, out);
   }
   return s;
 }
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index 6fe46b215c26..24810edee07c 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -43,9 +43,7 @@ namespace generic {
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target,
-                                                 Schedule sch,
-                                                 const Tensor& out) {
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
   detail::Fuse(sch[out], sch[out]->op.as<ComputeOpNode>()->axis);
   return sch;
 }
@@ -66,7 +64,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   auto s = create_schedule(out_ops);
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  schedule_injective_from_existing(target, s, x);
+  schedule_injective_from_existing(s, x);
   return s;
 }
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 8efe7ff7d86e..85b18aaed195 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -43,9 +43,7 @@ namespace x86 {
  *
  * \return The updated schedule.
  */
-inline Schedule schedule_injective_from_existing(const Target& target,
-                                                 Schedule sch,
-                                                 const Tensor& out) {
+inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) {
   auto axis = sch[out]->op.as<ComputeOpNode>()->axis;
   if (axis.size() == 4) {
     auto n = axis[0];
@@ -75,7 +73,7 @@ inline Schedule schedule_injective(const Target &target, const Array<Tensor>& ou
   tvm::schedule::AutoInlineInjective(s);
   auto x = outs[0];
-  schedule_injective_from_existing(target, s, x);
+  schedule_injective_from_existing(s, x);
   return s;
 }
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index b7bea8dd1717..71cc6f8858db 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -587,7 +587,7 @@ TVM_REGISTER_GLOBAL("topi.generic.schedule_injective")
 TVM_REGISTER_GLOBAL("topi.generic.schedule_injective_from_existing")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::generic::schedule_injective_from_existing(args[0], args[1], args[2]);
+  *rv = topi::generic::schedule_injective_from_existing(args[0], args[1]);
   });

 /* x86 schedules */
@@ -617,7 +617,7 @@ TVM_REGISTER_GLOBAL("topi.x86.schedule_injective")
 TVM_REGISTER_GLOBAL("topi.x86.schedule_injective_from_existing")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::x86::schedule_injective_from_existing(args[0], args[1], args[2]);
+  *rv = topi::x86::schedule_injective_from_existing(args[0], args[1]);
   });

 /* ROCm schedules */
@@ -659,7 +659,7 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective")
 TVM_REGISTER_GLOBAL("topi.cuda.schedule_injective_from_existing")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1], args[2]);
+  *rv = topi::cuda::schedule_injective_from_existing(args[0], args[1]);
   });

 TVM_REGISTER_GLOBAL("topi.cuda.schedule_pool")
@@ -763,7 +763,7 @@ TVM_REGISTER_GENERIC_FUNC(schedule_binary_dense)
 /*! \brief Builder function for instantiating schedules from existing schedules. */
 using FTVMScheduleFromExistingBuilder = std::function<
-  tvm::Schedule(const Target& target, tvm::Schedule sch, const tvm::Tensor& out)>;
+  tvm::Schedule(tvm::Schedule sch, const tvm::Tensor& out)>;

 /*!
  * \brief Helper function for registering generic functions matching the
@@ -776,7 +776,7 @@ using FTVMScheduleFromExistingBuilder = std::function<
  */
 inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder builder) {
   return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
-    *ret = builder(args[0], args[1], args[2]);
+    *ret = builder(args[1], args[2]);
   });
 }
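
With the target argument gone, the CUDA variant now reads the target from the ambient scope via Target::Current(false), which fails when no scope is active, so callers need an enclosing target context. From Python that looks roughly like this; the shape is an assumption for illustration:

    import tvm
    import topi

    A = tvm.placeholder((1 << 20,), name="A")
    B = topi.add(A, 1.0)

    # A target scope must be active so the schedule can read
    # max_num_threads from the current target.
    with tvm.target.create("cuda"):
        s = topi.cuda.schedule_injective([B])
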
From eab0b01c1301dce306f74a652465dceeb74493a5 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Wed, 25 Sep 2019 09:12:23 -0700
Subject: [PATCH 14/18] Fix docs

---
 topi/include/topi/cuda/injective.h    | 1 -
 topi/include/topi/generic/injective.h | 1 -
 topi/include/topi/x86/injective.h     | 1 -
 3 files changed, 3 deletions(-)

diff --git a/topi/include/topi/cuda/injective.h b/topi/include/topi/cuda/injective.h
index 888b41b37be4..663bc1fcbeb2 100644
--- a/topi/include/topi/cuda/injective.h
+++ b/topi/include/topi/cuda/injective.h
@@ -37,7 +37,6 @@ namespace cuda {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
- * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
diff --git a/topi/include/topi/generic/injective.h b/topi/include/topi/generic/injective.h
index 24810edee07c..fa7df4ccfd46 100644
--- a/topi/include/topi/generic/injective.h
+++ b/topi/include/topi/generic/injective.h
@@ -37,7 +37,6 @@ namespace generic {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
- * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
diff --git a/topi/include/topi/x86/injective.h b/topi/include/topi/x86/injective.h
index 85b18aaed195..7cb79ae322c2 100644
--- a/topi/include/topi/x86/injective.h
+++ b/topi/include/topi/x86/injective.h
@@ -37,7 +37,6 @@ namespace x86 {
 /*!
  * \brief Updates an existing schedule for the given injective ops.
  *
- * \param target The target to update the schedule for.
  * \param sch The schedule to update.
  * \param out The tensor representing the injective op.
  *
From 2496cd77771ec805bcd658f907bb387a48a93ebc Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Wed, 25 Sep 2019 09:52:55 -0700
Subject: [PATCH 15/18] Try to fix unit test

---
 tests/cpp/build_module_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index 1a7f791f65a0..697c07154ab9 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -114,7 +114,7 @@ TEST(BuildModule, Heterogeneous) {
   auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
   Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
                                             {target_llvm, lowered_s2}};
-  auto module = build(inputs, Target(), config);
+  auto module = build(inputs, target_llvm, config);

   // Assertion for build.
   CHECK_EQ(module->imports().size(), 1);
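
The change above passes target_llvm as an explicit host target instead of a null Target. The closest Python-side analogue is tvm.build's target_host parameter; a rough single-device sketch under assumed shapes:

    import tvm
    import topi

    A = tvm.placeholder((1024,), name="A")
    B = topi.add(A, 1.0)
    with tvm.target.cuda():
        s = topi.cuda.schedule_injective([B])

    # Supply the host target explicitly, mirroring
    # build(inputs, target_llvm, config) in the C++ test.
    mod = tvm.build(s, [A, B], target="cuda", target_host="llvm")
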
From 419f8464b2741d83a33adc3550afdc314b3c3496 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Wed, 25 Sep 2019 14:49:56 -0700
Subject: [PATCH 16/18] Fix test

---
 tests/cpp/build_module_test.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index 697c07154ab9..a7237db482ac 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -102,7 +102,11 @@ TEST(BuildModule, Heterogeneous) {
     return copy[i] - C[i];
   }, "elemwise_sub");

+  const runtime::PackedFunc* enter_target_scope_func = runtime::Registry::Get("_EnterTargetScope");
+  (*enter_target_scope_func)(target_cuda);
   auto s1 = topi::cuda::schedule_injective(target_cuda, {elemwise_add});
+
+  (*enter_target_scope_func)(target_llvm);
   auto s2 = create_schedule({elemwise_sub->op});

   auto config = BuildConfig::Create();
@@ -114,7 +118,7 @@ TEST(BuildModule, Heterogeneous) {
   auto lowered_s2 = lower(s2, args2, "elemwise_sub", binds, config);
   Map<Target, Array<LoweredFunc>> inputs = {{target_cuda, lowered_s1},
                                             {target_llvm, lowered_s2}};
-  auto module = build(inputs, target_llvm, config);
+  auto module = build(inputs, Target(), config);

   // Assertion for build.
   CHECK_EQ(module->imports().size(), 1);
From 7aeac5ec1269f06ba13206bfb81cc3afe637dce1 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Wed, 25 Sep 2019 16:09:28 -0700
Subject: [PATCH 17/18] Fix other tests

---
 tests/python/unittest/test_runtime_heterogeneous.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
index e0ae2e18a3d6..eb51faef9090 100644
--- a/tests/python/unittest/test_runtime_heterogeneous.py
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -174,7 +174,8 @@ def check_device(device, target_device):

     dev_tar = {"cuda": "cuda", "opencl": "opencl"}
     for device, target in dev_tar.items():
-        check_device(device, target)
+        with tvm.target.create(device):
+            check_device(device, target)


 def get_duplex_graph(host_dev_type, device_dev_type):
@@ -394,7 +395,8 @@ def check_load_module():

     dev_tar = {"cuda": "cuda", "opencl": "opencl"}
     for device, target in dev_tar.items():
-        check_device(device, target)
+        with tvm.target.create(device):
+            check_device(device, target)

 if __name__ == "__main__":
     test_simplex_data_transferring()
From b698a15f68d01233eda79fc2092761c052c07880 Mon Sep 17 00:00:00 2001
From: Unknown
Date: Wed, 25 Sep 2019 20:09:57 -0700
Subject: [PATCH 18/18] Fix bug

---
 topi/src/topi.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 71cc6f8858db..7114f4d878a8 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -776,7 +776,7 @@ using FTVMScheduleFromExistingBuilder = std::function<
  */
 inline PackedFunc WrapScheduleFromExisting(FTVMScheduleFromExistingBuilder builder) {
   return PackedFunc([builder](TVMArgs args, TVMRetValue* ret) {
-    *ret = builder(args[1], args[2]);
+    *ret = builder(args[0], args[1]);
   });
 }
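
Taken together, the series makes injective scheduling target-scope aware end to end. A short smoke test on the llvm backend that exercises the target-scoped dispatch from Python; the array size is arbitrary:

    import numpy as np
    import tvm
    import topi

    A = tvm.placeholder((64,), name="A")
    B = topi.add(A, 1.0)

    # The llvm scope selects the cpu schedule variant.
    with tvm.target.create("llvm"):
        s = topi.generic.schedule_injective([B])

    f = tvm.build(s, [A, B], "llvm")
    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.arange(64, dtype=A.dtype), ctx)
    b = tvm.nd.array(np.zeros(64, dtype=A.dtype), ctx)
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)
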