From 5f0145f5eb723cd5290c1985838422c8157ae7b5 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Tue, 20 Apr 2021 14:46:00 -0700
Subject: [PATCH 01/28] Pipeline Compute Graph With New Subgraph Executor

Issue: SoC hardware platforms have multiple types of compute chipsets,
such as GPU, FPGA, APU, and RPU; there is a requirement to use these
compute units in parallel to reach the best performance.

Solution: In this pipeline solution, we first split the compute graph
into a group of subgraphs, then run these subgraphs in a pipeline
module so that the GPU/FPGA/APU/RPU can run in parallel.

---
 CMakeLists.txt                               |   7 +
 cmake/config.cmake                           |   2 +
 python/tvm/contrib/subgraph_executor.py      | 176 ++++++++
 python/tvm/relay/analysis/analysis.py        | 129 ++++++
 src/runtime/subgraph/subgraph_data.h         |  78 ++++
 src/runtime/subgraph/subgraph_executor.cc    | 191 +++++++++
 src/runtime/subgraph/subgraph_executor.h     | 108 +++++
 src/runtime/subgraph/subgraph_function.cc    | 132 ++++++
 src/runtime/subgraph/subgraph_function.h     |  37 ++
 src/runtime/subgraph/subgraph_struct.h       | 400 +++++++++++++++++++
 tests/python/relay/test_analysis_pipeline.py | 152 +++++++
 11 files changed, 1412 insertions(+)
 create mode 100644 python/tvm/contrib/subgraph_executor.py
 create mode 100644 src/runtime/subgraph/subgraph_data.h
 create mode 100644 src/runtime/subgraph/subgraph_executor.cc
 create mode 100644 src/runtime/subgraph/subgraph_executor.h
 create mode 100644 src/runtime/subgraph/subgraph_function.cc
 create mode 100644 src/runtime/subgraph/subgraph_function.h
 create mode 100644 src/runtime/subgraph/subgraph_struct.h
 create mode 100644 tests/python/relay/test_analysis_pipeline.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 127ba50b3720..ad3fbc06791d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -375,6 +375,7 @@ if(USE_PROFILER)
   set_source_files_properties(${RUNTIME_GRAPH_EXECUTOR_SRCS}
     PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_EXECUTOR_DEBUG")
+  file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
 endif(USE_PROFILER)
@@ -388,6 +389,12 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB)
   include(GoogleTest)
 endif()

+if(USE_SUBGRAPH_EXECUTOR)
+  message(STATUS "Build with Subgraph Executor support...")
+  file(GLOB RUNTIME_SUBGRAPH_SRCS src/runtime/subgraph/*.cc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_SUBGRAPH_SRCS})
+endif(USE_SUBGRAPH_EXECUTOR)
+
 # Module rules
 include(cmake/modules/VTA.cmake)
 include(cmake/modules/StandaloneCrt.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 8d8186c1b4f0..515d041d49b8 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -101,6 +101,8 @@ set(USE_STACKVM_RUNTIME OFF)
 # Whether enable tiny embedded graph executor.
 set(USE_GRAPH_EXECUTOR ON)

+# Whether enable subgraph runtime.
+set(USE_SUBGRAPH_EXECUTOR ON)
 # Whether enable tiny graph executor with CUDA Graph
 set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF)
diff --git a/python/tvm/contrib/subgraph_executor.py b/python/tvm/contrib/subgraph_executor.py
new file mode 100644
index 000000000000..3d323c565569
--- /dev/null
+++ b/python/tvm/contrib/subgraph_executor.py
@@ -0,0 +1,176 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Minimum subgraph executor that executes subgraphs containing TVM PackedFuncs."""
+import tvm._ffi
+from tvm.contrib import graph_executor
+
+
+def create(sub_mods):
+    """Create a subgraph runtime executor.
+
+    Parameters
+    ----------
+    sub_mods : list of dict
+        Each entry is {"lib": , "dev": }, holding a built module and
+        the device it runs on.
+
+    Returns
+    -------
+    submodule : SubGraphModule
+        Runtime subgraph module.
+    """
+    mods = []
+    for sub_mod in sub_mods:
+        m = graph_executor.GraphModule(sub_mod["lib"]["default"](sub_mod["dev"]))
+        mods.append(m)
+
+    submodule = SubGraphModule(mods)
+    return submodule
+
+
+class SubGraphModule(object):
+    """Wrapper runtime module.
+
+    This is a thin wrapper over the underlying TVM modules; you can
+    also directly call set_input, run, and get_output on the underlying
+    module functions.
+
+    Parameters
+    ----------
+    graph_modules : list of GraphModule
+        The internal graph executor modules that hold the actual graph
+        functions.
+
+    Attributes
+    ----------
+    graph_modules_ : list of GraphModule
+        The internal graph executor modules that hold the actual graph
+        functions.
+    """
+
+    def __init__(self, graph_modules):
+        mods = []
+        for module in graph_modules:
+            mods.append(module.module)
+
+        subgraphcreate = tvm._ffi.get_global_func("tvm.subgraph_executor.create")
+        module = subgraphcreate(mods)
+        # Keep a handle to the underlying module so __getitem__ can look
+        # up packed functions by name.
+        self.module = module
+
+        self.graph_modules_ = graph_modules
+
+        self._set_input = module["set_input"]
+        self._run = module["run"]
+        self._stop = module["stop"]
+        self._get_output = module["get_output"]
+        self._get_input = module["get_input"]
+        self._get_num_outputs = module["get_num_outputs"]
+        self._get_num_inputs = module["get_num_inputs"]
+
+    def set_input(self, key=None, value=None, params=None):
+        """Set inputs to the module via kwargs
+
+        Parameters
+        ----------
+        key : int or str
+            The input key
+
+        value : NDArray
+            The input value
+
+        params : list of dict
+            Per-subgraph inputs; the i-th dict is forwarded as keyword
+            arguments to the i-th graph module's set_input.
+        """
+        if key is not None:
+            self.graph_modules_[0].set_input(key, value)
+
+        if params:
+            indx = 0
+            for param in params:
+                self.graph_modules_[indx].set_input(**param)
+                indx = indx + 1
+
+    def run(self, **input_dict):
+        """Run forward execution of the graph
+
+        Parameters
+        ----------
+        input_dict : dict of str to NDArray
+            Input name-to-value pairs to be fed to the first subgraph.
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+        self._run()
+
+    def stop(self):
+        """Stop the subgraph pipeline run"""
+        self._stop()
+
+    def get_num_outputs(self):
+        """Get the number of outputs from the graph
+
+        Returns
+        -------
+        count : int
+            The number of outputs.
+        """
+        return self._get_num_outputs()
+
+    def get_num_inputs(self):
+        """Get the number of inputs to the graph
+
+        Returns
+        -------
+        count : int
+            The number of inputs.
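+
+        Examples
+        --------
+        A minimal sketch; it assumes ``smod`` is a SubGraphModule
+        created by :py:func:`create`::
+
+            num_inputs = smod.get_num_inputs()  # inputs of the first subgraph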
+ """ + return self._get_num_inputs() + + def get_input(self, input_indx, runtime_index=0, out=None): + """Get index-th input to out + + Parameters + ---------- + index : int + The input index + + out : NDArray + The output array container + """ + if out: + self._get_input(input_indx, runtime_index).copyto(out) + return out + + return self._get_input(input_indx, runtime_index) + + def get_output(self): + """Get index-th output to out + + Parameters + ---------- + index : int + The output index + """ + return self._get_output() + + def __getitem__(self, key): + """Get internal module function + + Parameters + ---------- + key : str + The key to the module. + """ + return self.module[key] diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index c7b6c60849a1..0192621b0901 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -20,6 +20,7 @@ This file contains the set of passes for Relay, which exposes an interface for configuring the passes and scripting them in Python. """ +import tvm from ...ir import IRModule from ...relay import transform, build_module from ...runtime.ndarray import cpu @@ -447,3 +448,131 @@ def get_calibration_data(mod, data): calib_data[gvar] = value return calib_data + + +def pipeline_graph(expr, indices): + """Split Graph Into A Group Of Subgraph + Parameters + ---------- + expr : tvm.relay.Expr + indices : Array[int] + + Returns + ------- + ret : Array[tvm.relay.IRModule] + """ + + def run_opt_pass(expr, opt_pass): + """Exectue a relay pass""" + assert isinstance(opt_pass, tvm.transform.Pass) + mod = tvm.IRModule.from_expr(expr) + mod = tvm.relay.transform.InferType()(mod) + mod = opt_pass(mod) + entry = mod["main"] + return entry if isinstance(expr, tvm.relay.Function) else entry.body + + def _operator_idx_inc(expr, operator_current_idx): + """Increase operator index""" + if not isinstance(expr, tvm.relay.expr.Constant): + operator_current_idx = operator_current_idx + 1 + + return operator_current_idx + + def merge_constant_expr(constant_expr, expr): + # merge constant express with a express + # Parameters + # ---------- + # constant_expr: + # constant expression + # expr: + # expression to merge with constant expression + + # If body not let, then reached end of the express + if not isinstance(constant_expr.body, tvm.relay.expr.Let): + return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr) + + return tvm.relay.expr.Let( + constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr) + ) + + def _recursion(anf, operator_indx, pipeline_mods, indices, constant_expr): + # Enumrate all operator of compute graph then split the compute graph + # into a group subgraph. + # Parameters + # ---------- + # anf: + # ANF format expression + # operator_indx: + # current operator indice + # pipeline_mods: + # the subgraph list get storage in this variable + # indices: + # Array of indices use to define the subgraph scope + # constant_expr: + # constant defined before current operator + + # Do the split work + if isinstance(anf, tvm.relay.Function): + return tvm.relay.Function( + anf.params, + _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr), + anf.ret_type, + anf.type_params, + anf.attrs, + ) + if isinstance(anf, tvm.relay.expr.Let): + value = anf.value + operator_indx = _operator_idx_inc(value, operator_indx) + + # record constan expr to make sure all sugraph can find correct + # constant. 
+            if isinstance(value, tvm.relay.expr.Constant):
+                if not constant_expr:
+                    constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var)
+                else:
+                    constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr)

+            if isinstance(value, tvm.relay.expr.Call):
+                if isinstance(value.op, tvm.ir.Op):

+                    # If we have the expr a(b(c(d(e)))) and the indices are
+                    # [1, 2, 3], then we get separate modules for a(b), c,
+                    # and d(e); the split areas are a(b)[0,1] c[2,2] d(e)[2,3].
+                    if indices and operator_indx == indices[0]:
+                        indices.pop(0)
+                        ann = _recursion(
+                            anf.body, operator_indx, pipeline_mods, indices, constant_expr
+                        )

+                        # When the current subgraph uses a constant from a
+                        # previous subgraph, that constant would become a free
+                        # variable because its definition is missing; merge the
+                        # previous constants into the current subgraph to avoid
+                        # this issue.
+                        if constant_expr:
+                            ann = merge_constant_expr(constant_expr, ann)

+                        ann = run_opt_pass(ann, transform.ToGraphNormalForm())
+                        mod = tvm.IRModule.from_expr(ann)
+                        pipeline_mods.insert(0, mod)
+                        return tvm.relay.expr.Let(anf.var, value, anf.var)
+            return tvm.relay.expr.Let(
+                anf.var,
+                value,
+                _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr),
+            )
+        else:
+            return anf

+    pipeline_mods = []

+    # Operator counting starts from 0, so the initial value is set to -1.
+    operator_indx = -1
+    constant_expr = None
+    subgraph_indices = indices.copy()
+    anf = run_opt_pass(expr, transform.ToANormalForm())
+    anf = run_opt_pass(anf, transform.InferType())
+    ann = _recursion(anf, operator_indx, pipeline_mods, subgraph_indices, constant_expr)
+    ann = run_opt_pass(ann.body, transform.ToGraphNormalForm())
+    mod = tvm.IRModule.from_expr(ann)
+    pipeline_mods.insert(0, mod)
+    return pipeline_mods
diff --git a/src/runtime/subgraph/subgraph_data.h b/src/runtime/subgraph/subgraph_data.h
new file mode 100644
index 000000000000..3ec22d37b11f
--- /dev/null
+++ b/src/runtime/subgraph/subgraph_data.h
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ +#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ +#define EXPORT __attribute__((visibility("default"))) +#define IMPORT +#include +#include +#include +#include +#include "subgraph_struct.h" +#ifdef __cplusplus + +#if defined(__x86_64) +#define read_barrier() __asm__ __volatile__("":::"memory") +#else +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define read_barrier() dsb(st) +#endif + +template +squeue* createQueue(squeue* q, size_t size) { + squeue* rq = new squeue(); + return rq; +} + +template +void deleteQueue(squeue* q) { + free(q); +} + +template +inline bool full(squeue* q) { + return ((q->tail + 1) % q->len) == q->head; +} + +template +inline bool empty(squeue* q) { + return q->head == q->tail; +} + +template +void q_push(squeue* q, const VARIABLE_TYPE& s) { + while (full(q)) { + } + q->q[q->tail] = s; + read_barrier(); + q->tail = (q->tail + 1) % q->len; +} + +template +bool q_poll(squeue* q, VARIABLE_TYPE* s) { + if (empty(q)) return false; + *s = q->q[q->head]; + read_barrier(); + q->head = (q->head + 1) % q->len; + return true; +} +// extern "C" +#endif + +#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ diff --git a/src/runtime/subgraph/subgraph_executor.cc b/src/runtime/subgraph/subgraph_executor.cc new file mode 100644 index 000000000000..5ce39af03246 --- /dev/null +++ b/src/runtime/subgraph/subgraph_executor.cc @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file subgraph_runtime.cc + */ +#include "subgraph_executor.h" +#include + +namespace tvm { +namespace runtime { + +/*! + *\bief Stop subgraph run. + */ +void SubGraphRuntime::Stop() { subgraph_stop(runtimes); } +/*! + * \brief Run all the operations one by one. + */ +void SubGraphRuntime::Run() { + // setup the array and requirements. + int graphNum = runtimes.size(); + subgraph_run(runtimes, true); +} + +void SubGraphRuntime::Init(const Array& modules) { + subgraph_init(modules, &runtimes); + SetupStorage(); + return; +} + +void SubGraphRuntime::SetupStorage(void) { + auto lastGraphRuntime = runtimes.back(); + int outputNum = lastGraphRuntime->runtimePtr->NumOutputs(); + for (int i = 0; i < outputNum; i++) { + NDArray array = lastGraphRuntime->runtimePtr->GetOutput(i); + auto dltensor = const_cast(array.operator->()); + vector shape; + for (int i = 0; i < dltensor->ndim; i++) { + shape.push_back(dltensor->shape[i]); + } + + auto ndarray = NDArray::Empty(shape, dltensor->dtype, dltensor->device); + ndarray.CreateView(shape, dltensor->dtype); + output_entry_.push_back(ndarray); + } +} + +/*! + * \brief set index-th input to the graph. + * \param index The input index. + * \param data_in The input data. 
+ */ +void SubGraphRuntime::SetInput(int index, DLTensor* data_in) { + auto gruntime = runtimes.front(); + gruntime->runtimePtr->SetInput(index, data_in); +} + +void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in) { + auto gruntime = runtimes.front(); + gruntime->runtimePtr->SetInput(name, data_in); +} + +/*! + * \brief Get the number of outputs + * + * \return The number of outputs from last subgraph. + */ +int SubGraphRuntime::NumOutputs() const { return runtimes.back()->runtimePtr->NumOutputs(); } + +/*! + * \brief Get the number of inputs + * + * \return The number of inputs to the first subgraph. + */ +int SubGraphRuntime::NumInputs() const { return runtimes.front()->runtimePtr->NumInputs(); } + +/*! + * \brief Return NDArray for given input index. + * \param index The input index. + * + * \return NDArray corresponding to given input node index. + */ +NDArray SubGraphRuntime::GetInput(int index, int mIndx) const { + auto gruntime = runtimes[mIndx]; + return gruntime->runtimePtr->GetInput(index); +} + +NDArray SubGraphRuntime::GetInput(const std::string& name, int mIndx) const { + auto gruntime = runtimes[mIndx]; + return gruntime->runtimePtr->GetInput(name); +} + +/*! + * \brief Return NDArray Array for all output. + * + * \return NDArray Array for all output. + */ +Array SubGraphRuntime::GetOutput(bool syncPoll) { + Array nd; +#ifdef SERIALIZE + auto gruntime = runtimes.back(); + nd.push_back(gruntime->runtimePtr->GetOutput(0)); + DLTensor* dt = const_cast(nd.back().operator->()); +#else + if (subgraph_poll(&output_entry_, runtimes, syncPoll)) { + for (auto output : output_entry_) { + nd.push_back(output); + } + } +#endif + return nd; +} + +PackedFunc SubGraphRuntime::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + // Return member functions during query. 
+ if (name == "set_input") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + if (String::CanConvertFrom(args[0])) { + this->SetInput(args[0].operator String(), args[1]); + } else { + this->SetInput(static_cast(args[0]), args[1]); + } + }); + } else if (name == "get_output") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + if (args.num_args == 1) { + *rv = this->GetOutput(static_cast(args[0])); + } else { + *rv = this->GetOutput(); + } + }); + } else if (name == "get_input") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + int in_idx = 0, graph_idx = 0; + if (args.num_args == 2) { + graph_idx = args[1]; + } + + if (String::CanConvertFrom(args[0])) { + *rv = this->GetInput(args[0].operator String(), graph_idx); + } else { + in_idx = args[0]; + if (in_idx >= 0) { + *rv = this->GetInput(in_idx, graph_idx); + } + } + }); + } else if (name == "get_num_outputs") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumOutputs(); }); + } else if (name == "get_num_inputs") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); }); + } else if (name == "run") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); }); + } else if (name == "stop") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Stop();}); + } else { + return PackedFunc(); + } +} + +Module SubGraphRuntimeCreate(const Array& m) { + auto exec = make_object(); + exec->Init(m); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.subgraph_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = SubGraphRuntimeCreate(args[0]); +}); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/subgraph/subgraph_executor.h b/src/runtime/subgraph/subgraph_executor.h new file mode 100644 index 000000000000..3018b70d7f62 --- /dev/null +++ b/src/runtime/subgraph/subgraph_executor.h @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \brief Tiny graph runtime that can run graph + * containing only tvm PackedFunc. + * \file graph_runtime.h + */ +#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ +#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ +#include "subgraph_function.h" +#include +#include +#include + +namespace tvm { +namespace runtime { + +/*! + * \brief subgraph runtime. + * + * This runtime can be acccesibly in various language via + * TVM runtime PackedFunc API. + */ +class TVM_DLL SubGraphRuntime : public ModuleNode { + public: + /*! + * \brief Get member function to front-end + * \param name The name of the function. + * \param sptr_to_self The pointer to the module node. 
+ * \return The corresponding member function. + */ + virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + + /*! + * \return The type key of the executor. + */ + const char* type_key() const final { return "SubGraphRuntime"; } + void Run(); + void Stop(); + void SetupStorage(); + + /*! + * \brief Initialize the graph executor with graph and context. + * \param graph_json The execution graph. + * \param module The module containing the compiled functions for the host + * processor. + * \param ctxs The context of the host and devices where graph nodes will be + * executed on. + * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters + * by storage_id. If not given, linked parameters are looked-up using an internal implementation, + * which is not compatible with RPCModules. + */ + void Init(const Array& modules); + + /*! + * \brief set index-th input to the graph. + * \param index The input index. + * \param data_in The input data. + */ + void SetInput(int index, DLTensor* data_in); + void SetInput(const std::string& name, DLTensor* data_in); + NDArray GetInput(int index, int mIndx) const; + NDArray GetInput(const std::string& name, int mIndx) const; + /*! + * \brief Get the number of outputs + * + * \return The number of outputs from graph. + */ + int NumOutputs() const; + /*! + * \brief Get the number of inputs + * + * \return The number of inputs to the graph. + */ + int NumInputs() const; + /*! + * \brief Return NDArray Array for all output. + * + * \param syncPoll Syncholization poll mode or ASyncholization. + * \return NDArray Array for all output. + */ + Array GetOutput(bool syncPoll = true); + + protected: + std::vector output_entry_; + std::vector> runtimes; +}; +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ diff --git a/src/runtime/subgraph/subgraph_function.cc b/src/runtime/subgraph/subgraph_function.cc new file mode 100644 index 000000000000..b10c924a7b11 --- /dev/null +++ b/src/runtime/subgraph/subgraph_function.cc @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "subgraph_function.h" +#include +using namespace tvm::runtime; + +void subgraph_pipeline_run(const int& num, const shared_ptr& curRunItem) { + QUEUE* curQueue = curRunItem->queue; + QUEUE* nextQueue = curRunItem->next->queue; + + auto id = std::this_thread::get_id(); + + bool suc = false; + while (curRunItem->waitPipeLineData(suc)) { + suc = subgraph_queue_poll(curQueue, &curRunItem->rData); + if (!suc) { + continue; + } + + curRunItem->Run(); + + auto output = curRunItem->GetOutput(); + subgraph_queue_push(nextQueue, output); + cout << num << " subgraph run..." 
<< endl; + curRunItem->notifyDataReadyToNext(); + } + curRunItem->notifyNextExit(); + + cout << "end " << __FUNCTION__ << " num " << num << endl; +} + +thread* subgraph_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { + for (int i = 1; i < runtimes->size(); i++) { + (*runtimes)[i]->t = move(thread(subgraph_pipeline_run, i, (*runtimes)[i])); + } + return NULL; +} + +void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { + int len = graphRuntimes.size(); + for (int i = 0; i < len; i++) { + QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); + auto runItem = make_shared(graphRuntimes[i], sub_queue); + runtimes->push_back(runItem); + /* + set prev and next for RuntimeItem, runtime need these information to + poll data from prev and do notification for next. + */ + if (i > 0) { + (*runtimes)[i - 1]->next = (*runtimes)[i]; + } + if (i == len - 1) { + (*runtimes)[i]->next = (*runtimes)[0]; + } + } +#ifndef SERIALIZE + subgraph_pipeline_init(runtimes); +#endif + return; +} + +inline void subgraph_queue_push(QUEUE* queue, Array arrays) { + q_push>(queue, arrays); + return; +} + +bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { + return q_poll(queue, runtimeData); +} + +void subgraph_run_serial(const SHARED_RUNTIME_VEC runtimes) { + runtimes[0]->Run(); + for (int i = 1; i < runtimes.size(); i++) { + int oNum = runtimes[i - 1]->runtimePtr->NumOutputs(); + for (int j = 0; j < oNum; j++) { + auto o = runtimes[i - 1]->runtimePtr->GetOutput(j); + DLTensor* ptr = const_cast(o.operator->()); + runtimes[i]->runtimePtr->SetInput(j, ptr); + } + runtimes[i]->Run(); + } +} + +void subgraph_run(const SHARED_RUNTIME_VEC& runtimes, bool synch) { +#ifdef SERIALIZE + subgraph_run_serial(runtimes); + return; +#endif + shared_ptr runtime = runtimes.front(); + runtime->Run(); + subgraph_queue_push(runtime->next->queue, runtime->GetOutput()); + runtime->notifyDataReadyToNext(); + return; +} + +bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool synch) { + shared_ptr firstRuntime = runtimes.front(); + QUEUE* queue = firstRuntime->queue; +#ifndef SERIALIZE + bool suc = false; + if (firstRuntime->waitPipeLineData(suc || !synch)) { + subgraphOutputData<> outputData(output); + suc = q_poll>(queue, &outputData); + cout << "run done suc is " << suc << endl; + } + return suc; +#else + subgraphOutputData<> outputData(output); + return q_poll>(queue, &outputData); +#endif +} + +void subgraph_stop(const SHARED_RUNTIME_VEC &runtimes) { + cout << __FUNCTION__ << endl; + runtimes.front()->notifyNextExit(); +} diff --git a/src/runtime/subgraph/subgraph_function.h b/src/runtime/subgraph/subgraph_function.h new file mode 100644 index 000000000000..d2329d74c056 --- /dev/null +++ b/src/runtime/subgraph/subgraph_function.h @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ +#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ +#include "subgraph_data.h" +#include +#include + +using namespace std; +using namespace tvm::runtime; +typedef vector> SHARED_RUNTIME_VEC; + +void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes); +void subgraph_run(const SHARED_RUNTIME_VEC& runtimes, bool synch = false); +inline void subgraph_queue_push(QUEUE* queue, Array arrays); +bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData); +bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, + const bool sync = false); +void subgraph_stop(const SHARED_RUNTIME_VEC &runtimes); + +#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ diff --git a/src/runtime/subgraph/subgraph_struct.h b/src/runtime/subgraph/subgraph_struct.h new file mode 100644 index 000000000000..0bc5dff3f7e0 --- /dev/null +++ b/src/runtime/subgraph/subgraph_struct.h @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ +#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define SLOT slot_t<> +#define SUB_Q_SIZE 1024 +// #define SERIALIZE +using namespace tvm::runtime; +using namespace std; +// thread control struction, for single consumer single producer mode +class TControl { + private: + condition_variable cond; + volatile bool bWait = false; + mutex m; + + public: + volatile bool bExit = false; + bool wait(bool bPollSuc) { + if (bPollSuc) { + return true; + } + + unique_lock lock(m); + cond.wait(lock, [&] { return this->bWait; }); + bWait = false; + + return !bExit; + } + + void notify(void) { + bWait = true; + cond.notify_one(); + } + + void exit_notify(thread* t) { + /* + * set bExit first then notify + */ + bExit = true; + notify(); + if (t->joinable()) { + t->join(); + cout << "exit_notify suc" << endl; + } + } +}; + +class subgraphData { + private: + void ResetDataList(size_t num) { + if (max_num < num) { + for (int i = 0; i < max_num; i++) { + TVMArrayFree(dataList[i]); + } + + if (dataList) { + free(dataList); + } + + dataList = reinterpret_cast(calloc(num, sizeof(DLTensor*))); + max_num = num; + } + return; + } + + DLTensor* CreateCopyFrom(const DLTensor* from, DLTensor** to, int device_type, int device_id) { + size_t fromLen = tvm::runtime::GetDataSize(*from); + size_t toLen = *to ? 
tvm::runtime::GetDataSize(*(*to)) : 0; + + if (fromLen != toLen) { + if (*to) { + TVMArrayFree(*to); + *to = nullptr; + } + TVMArrayAlloc(from->shape, from->ndim, from->dtype.code, from->dtype.bits, from->dtype.lanes, + device_type, device_id, to); + } + TVMArrayCopyFromTo(const_cast(from), *to, nullptr); + return *to; + } + + public: + void Copy(const Array& dlArray, int device_type, int device_id) { + num = dlArray.size(); + ResetDataList(num); + + for (int i = 0; i < num; i++) { + CreateCopyFrom(const_cast(dlArray[i].operator->()), &dataList[i], + device_type, device_id); + } + return; + } + + void Copy(const DLTensor* dlTensor, int device_type, int device_id) { + num = 1; + ResetDataList(num); + CreateCopyFrom(dlTensor, &dataList[0], device_type, device_id); + return; + } + + void Copy(const vector& dlTensors, int device_type, int device_id) { + num = dlTensors.size(); + ResetDataList(num); + + for (int i = 0; i < num; i++) { + CreateCopyFrom(dlTensors[i], &dataList[i], device_type, device_id); + } + return; + } + + void Copy(DLTensor** dlTensors, size_t dlNum, int device_type, int device_id) { + num = dlNum; + ResetDataList(num); + + for (int i = 0; i < num; i++) { + auto dlTensor = const_cast(dlTensors[i]); + CreateCopyFrom(dlTensor, &dataList[i], device_type, device_id); + } + return; + } + size_t num; + size_t max_num; + DLTensor** dataList; + TControl controlData; + subgraphData(void) : num(0), max_num(0), dataList(nullptr) {} +}; + +template +class slot_t{ + public: + bool bExit = false; + subgraphData data; + slot_t(void) {} + + // overwrite operator = to handle "(slot) s = (OutputData) d;" + slot_t& operator=(const DLTensor* dlTensor) { + data.Copy(dlTensor, device_type, device_id); + return *this; + } + + slot_t& operator=(const vector dlTensors) { + data.Copy(dlTensors, device_type, device_id); + return *this; + } + + slot_t& operator=(const Array dlTensors) { + data.Copy(dlTensors, device_type, device_id); + return *this; + } + + slot_t& operator=(const slot_t& slot) { + data.Copy(slot.data.dataList, slot.data.num, device_type, device_id); + return *this; + } +}; + +template +class subgraphOutputData { + public: + explicit subgraphOutputData(vector* datas) : datas_(datas) { ; } + subgraphOutputData& operator=(const slot_t& slot) { + size_t num = slot.data.num; + assert(datas_->size() >= num); + for (size_t i = 0; i < slot.data.num; i++) { + auto dlTensor = slot.data.dataList[i]; + (*datas_)[i].CopyFrom(dlTensor); + } + return *this; + } + + private: + vector* datas_; +}; + +template +class squeue{ + public: + size_t len; + volatile size_t head; + volatile size_t tail; + SLOT_TYPE q[QLEN]; + squeue(void) : len(QLEN), head(0), tail(0) {} +}; +typedef squeue QUEUE; + +class RuntimeFunction{ + public: + DLTensor* dlLocal = nullptr; + Module module_; + tvm::runtime::PackedFunc get_num_output; + tvm::runtime::PackedFunc get_num_inputs; + tvm::runtime::PackedFunc set_input; + tvm::runtime::PackedFunc get_output; + tvm::runtime::PackedFunc get_input; + tvm::runtime::PackedFunc run; + explicit RuntimeFunction(const Module& m) { + module_ = m; + get_num_output = module_.GetFunction("get_num_outputs"); + get_num_inputs = module_.GetFunction("get_num_inputs"); + set_input = module_.GetFunction("set_input"); + get_output = module_.GetFunction("get_output"); + get_input = module_.GetFunction("get_input"); + run = module_.GetFunction("run"); + } + ~RuntimeFunction() { + if (dlLocal) { + TVMArrayFree(dlLocal); + dlLocal = nullptr; + } + } + + DLTensor* CreateFromDLTensor(const DLTensor* 
from) { + DLTensor* ret = NULL; + TVMArrayAlloc(from->shape, from->ndim, from->dtype.code, from->dtype.bits, from->dtype.lanes, + kDLCPU, 0, &ret); + return ret; + } + + int NumOutputs() const { return get_num_output(); } + int NumInputs() const { return get_num_inputs(); } + + /* + when doing subgraph pipeline, the from data and to + data may comming from different device, for example + one from GPU another from VTA, here we need first + copy it into cpu type memory from GPU then copy the + cpu type memory into VTA, because current NDArray + copy not support cross device memory copy. + */ + void CopyFromTo(DLTensor* from, DLTensor* to) { + if (!(from->device.device_type == to->device.device_type || + from->device.device_type == kDLCPU || to->device.device_type == kDLCPU || + from->device.device_type == kDLCPUPinned || to->device.device_type == kDLCPUPinned)) { + if (dlLocal == nullptr) { + dlLocal = CreateFromDLTensor(from); + } + TVMArrayCopyFromTo(from, dlLocal, nullptr); + from = dlLocal; + } + + TVMArrayCopyFromTo(from, to, nullptr); + } + + void SetInput(int index, DLTensor* data_in) { + /* + Here we can not use 'GetInput' of this class to replace + 'get_input' although it just be one more level wrap for + 'get_input', doing one more level wrap would + cause a NDArray copy and deconstruction after GetInput call, + when such NDArray comming from a RPC value, the deconstruction may + cause the remote data get free. then following operation for + such NDArray which linked a corrupt data would cause crash. + */ + NDArray input = get_input(index); + DLTensor* dlInput = const_cast(input.operator->()); + CopyFromTo(data_in, dlInput); + } + + void SetInput(const std::string& name, DLTensor* data_in) { + NDArray input = get_input(name); + DLTensor* dlInput = const_cast(input.operator->()); + CopyFromTo(data_in, dlInput); + } + + NDArray GetInput(const std::string& name) { return get_input(name); } + + NDArray GetOutput(int index) const { return get_output(index); } + + NDArray GetInput(int index) const { return get_input(index); } + + void Run() { run(); } +}; + +class RuntimeData { + private: + shared_ptr runtimePtr; + template + void ImportData(type dlTensors, size_t inputsLen) { + size_t num = runtimePtr->NumInputs(); + assert(num >= inputsLen); + for (int i = 0; i < inputsLen; i++) { + /* + * Use SetInput which have logic to handle + * cross device memory copy to set input data. + */ + runtimePtr->SetInput(i, dlTensors[i]); + } + return; + } + + public: + void Init(shared_ptr runtime) { runtimePtr = runtime; } + + RuntimeData& operator=(const SLOT& slot) { + ImportData(slot.data.dataList, slot.data.num); + return *this; + } + + RuntimeData& operator=(vector dlTensors) { + ImportData>(dlTensors, dlTensors.size()); + return *this; + } +}; + +class RuntimeItem { + public: + shared_ptr prev = nullptr; + shared_ptr next = nullptr; + + int inputsNum; + RuntimeData rData; + TControl control; + QUEUE* queue = nullptr; + thread t; + shared_ptr runtimePtr = nullptr; + RuntimeItem(Module mod, QUEUE* inputQueue) { + if (runtimePtr == nullptr) { + runtimePtr = make_shared(mod); + inputsNum = runtimePtr->NumOutputs(); + rData.Init(runtimePtr); + } + + if (!queue) { + queue = inputQueue; + } + } + + RuntimeItem(void) {} + + void Run(void) { runtimePtr->Run(); } + + bool waitPipeLineData(bool bPollSuc) { + bool ret = false; + /* + wait input data ready. 
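+       Returns true when data is ready, or immediately when bPollSuc
+       already reports a successful poll; returns false once an exit
+       notification is received, which ends the pipeline thread loop.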
+ */ + return control.wait(bPollSuc); + } + + void notifyDataReadyToNext(void) { + if (next) { + next->control.notify(); + } + } + + void notifyNextExit(void) { + if (next) { + next->control.exit_notify(&next->t); + } + } + + /* + * Here we need to use a container to storage NDArray that from + * GetOutput, if just copy the data but not storage NDArray, the + * memory of data may get freed, especially for RPC device data, + */ + Array GetOutput(void) { + Array outputs; + size_t outputsNum = runtimePtr->NumOutputs(); + for (int i = 0; i < outputsNum; i++) { + auto output = runtimePtr->GetOutput(i); + outputs.push_back(output); + } + return outputs; + } +}; + + +#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ diff --git a/tests/python/relay/test_analysis_pipeline.py b/tests/python/relay/test_analysis_pipeline.py new file mode 100644 index 000000000000..e7d66f83d13f --- /dev/null +++ b/tests/python/relay/test_analysis_pipeline.py @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm +import tvm.testing +from tvm import relay +from tvm.relay import transform +from tvm.contrib import graph_executor, subgraph_executor +from tvm.relay.analysis import pipeline_graph + + +def run_module(mod, dev, target, dname, data): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target) + + m = graph_executor.GraphModule(lib["default"](dev)) + m.set_input(dname, data) + m.run() + n = m.get_num_outputs() + output = m.get_output(0).asnumpy() + return output + +def get_network(): + dshape = (3, 3) + mvalue1 = np.full((1), 5).astype("float32") + mvalue2 = np.full((1), 2).astype("float32") + mvalue3 = np.full((1), 3).astype("float32") + mvalue4 = np.full((1), 4).astype("float32") + mv1 = relay.Constant(tvm.nd.array(mvalue1)) + mv2 = relay.Constant(tvm.nd.array(mvalue2)) + mv3 = relay.Constant(tvm.nd.array(mvalue3)) + mv4 = relay.Constant(tvm.nd.array(mvalue4)) + data = relay.var("data", relay.TensorType(dshape, "float32")) + net = relay.multiply(data, mv1) + net = relay.add(net, mv2) + net = relay.add(net, mv3) + net = relay.multiply(net, mv4) + net = relay.subtract(net, mv1) + func = relay.Function([data], net) + mod = tvm.IRModule.from_expr(func) + return mod, dshape + + +mod, dshape = get_network() +""" +#split compute graph into 4 subgraph +""" +pl = [0, 2, 3] +mods = pipeline_graph(mod["main"], pl) + +""" +#Prepare batch data for pipeline feeding +""" +datas = [] +for i in range(len(pl) +1): + datas.append(np.full(dshape, 3 + i).astype("float32")) + +""" +#Run with graph executor for verification purpose +""" +outs = [] +for data in datas: + outs.append(run_module(mod, tvm.cpu(), "llvm", "data", data)) + +""" +#Parameter use for subgraph executor creation +""" +sub_mod = [] + +""" +#Build module and append 
module and device type into variable that +#use for subgraph creation. +#first and second subgraph use cuda when cuda enable, second and +#last subgraph use cpu +""" +with relay.build_config(opt_level=3): + """ + #Build first subgraph and create parameter for subgraph + #creation. + """ + if tvm.testing.device_enabled("cuda"): + lib, dev = relay.build(mods[0], "cuda"), tvm.gpu() + else: + lib, dev = relay.build(mods[0], "llvm"), tvm.cpu() + sub_mod.append({"lib":lib, "dev":dev}) + + """ + #Build second subgraph, append module and device into + #subgraph creation parameter. + """ + if tvm.testing.device_enabled("cuda"): + lib, dev = relay.build(mods[1], "cuda"), tvm.gpu() + else: + lib, dev = relay.build(mods[1], "llvm"), tvm.cpu() + sub_mod.append({"lib":lib, "dev":dev}) + + """ + #third subgraph + """ + lib = relay.build(mods[2], "llvm") + sub_mod.append({"lib":lib, "dev":tvm.cpu()}) + + """ + #last subgraph + """ + lib = relay.build(mods[3], "llvm") + sub_mod.append({"lib":lib, "dev":tvm.cpu()}) + +""" +#Create subgraph executor +""" +smod = subgraph_executor.create(sub_mod) + +""" +#Use subgraph executor to pipeline the said subgraph which use different backend +""" +for data in datas: + smod.set_input("data", data) + smod.run() + +""" +Get result +""" +sub_outputs = [] +for i in range(len(datas)): + sub_outputs.append(smod.get_output()[0].asnumpy()) + +""" +#Stop pipeline execution. +""" +smod.stop() +""" + +#Verify result +""" +for i in range(len(datas)): + tvm.testing.assert_allclose(outs[i],sub_outputs[i]) From 76fe55ab65369388d9e3b8a6486c88ae16a602bb Mon Sep 17 00:00:00 2001 From: huajsj Date: Tue, 20 Apr 2021 22:37:51 -0700 Subject: [PATCH 02/28] Fix plint error --- src/runtime/subgraph/subgraph_data.h | 5 ++- src/runtime/subgraph/subgraph_executor.cc | 9 ++-- src/runtime/subgraph/subgraph_executor.h | 9 ++-- src/runtime/subgraph/subgraph_function.cc | 19 ++++++--- src/runtime/subgraph/subgraph_function.h | 5 ++- src/runtime/subgraph/subgraph_struct.h | 45 +++++++++----------- tests/python/relay/test_analysis_pipeline.py | 13 +++--- 7 files changed, 54 insertions(+), 51 deletions(-) diff --git a/src/runtime/subgraph/subgraph_data.h b/src/runtime/subgraph/subgraph_data.h index 3ec22d37b11f..13e3a25a979c 100644 --- a/src/runtime/subgraph/subgraph_data.h +++ b/src/runtime/subgraph/subgraph_data.h @@ -20,15 +20,16 @@ #define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ #define EXPORT __attribute__((visibility("default"))) #define IMPORT +#include #include #include #include -#include + #include "subgraph_struct.h" #ifdef __cplusplus #if defined(__x86_64) -#define read_barrier() __asm__ __volatile__("":::"memory") +#define read_barrier() __asm__ __volatile__("" ::: "memory") #else #define dsb(opt) asm volatile("dsb " #opt : : : "memory") #define read_barrier() dsb(st) diff --git a/src/runtime/subgraph/subgraph_executor.cc b/src/runtime/subgraph/subgraph_executor.cc index 5ce39af03246..3476cb03d15c 100644 --- a/src/runtime/subgraph/subgraph_executor.cc +++ b/src/runtime/subgraph/subgraph_executor.cc @@ -21,6 +21,7 @@ * \file subgraph_runtime.cc */ #include "subgraph_executor.h" + #include namespace tvm { @@ -33,11 +34,7 @@ void SubGraphRuntime::Stop() { subgraph_stop(runtimes); } /*! * \brief Run all the operations one by one. */ -void SubGraphRuntime::Run() { - // setup the array and requirements. 
- int graphNum = runtimes.size(); - subgraph_run(runtimes, true); -} +void SubGraphRuntime::Run() { subgraph_run(runtimes, true); } void SubGraphRuntime::Init(const Array& modules) { subgraph_init(modules, &runtimes); @@ -172,7 +169,7 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, } else if (name == "run") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); }); } else if (name == "stop") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Stop();}); + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Stop(); }); } else { return PackedFunc(); } diff --git a/src/runtime/subgraph/subgraph_executor.h b/src/runtime/subgraph/subgraph_executor.h index 3018b70d7f62..14ac0d5355ae 100644 --- a/src/runtime/subgraph/subgraph_executor.h +++ b/src/runtime/subgraph/subgraph_executor.h @@ -24,11 +24,12 @@ */ #ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ #define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ -#include "subgraph_function.h" -#include #include +#include #include +#include "subgraph_function.h" + namespace tvm { namespace runtime { @@ -99,8 +100,8 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { Array GetOutput(bool syncPoll = true); protected: - std::vector output_entry_; - std::vector> runtimes; + std::vector output_entry_; + std::vector> runtimes; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/subgraph/subgraph_function.cc b/src/runtime/subgraph/subgraph_function.cc index b10c924a7b11..3e9813006e9c 100644 --- a/src/runtime/subgraph/subgraph_function.cc +++ b/src/runtime/subgraph/subgraph_function.cc @@ -17,6 +17,7 @@ * under the License. */ #include "subgraph_function.h" + #include using namespace tvm::runtime; @@ -24,8 +25,10 @@ void subgraph_pipeline_run(const int& num, const shared_ptr& curRun QUEUE* curQueue = curRunItem->queue; QUEUE* nextQueue = curRunItem->next->queue; - auto id = std::this_thread::get_id(); - + /* + * Wait at beginning, then only do wait once last time data poll failed, + * the loop would break after an exit notification get received. 
+ */ bool suc = false; while (curRunItem->waitPipeLineData(suc)) { suc = subgraph_queue_poll(curQueue, &curRunItem->rData); @@ -46,7 +49,7 @@ void subgraph_pipeline_run(const int& num, const shared_ptr& curRun } thread* subgraph_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { - for (int i = 1; i < runtimes->size(); i++) { + for (size_t i = 1; i < runtimes->size(); i++) { (*runtimes)[i]->t = move(thread(subgraph_pipeline_run, i, (*runtimes)[i])); } return NULL; @@ -86,7 +89,7 @@ bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { void subgraph_run_serial(const SHARED_RUNTIME_VEC runtimes) { runtimes[0]->Run(); - for (int i = 1; i < runtimes.size(); i++) { + for (size_t i = 1; i < runtimes.size(); i++) { int oNum = runtimes[i - 1]->runtimePtr->NumOutputs(); for (int j = 0; j < oNum; j++) { auto o = runtimes[i - 1]->runtimePtr->GetOutput(j); @@ -114,8 +117,10 @@ bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, QUEUE* queue = firstRuntime->queue; #ifndef SERIALIZE bool suc = false; - if (firstRuntime->waitPipeLineData(suc || !synch)) { - subgraphOutputData<> outputData(output); + subgraphOutputData<> outputData(output); + suc = q_poll>(queue, &outputData); + if (!suc) { + firstRuntime->waitPipeLineData(!synch); suc = q_poll>(queue, &outputData); cout << "run done suc is " << suc << endl; } @@ -126,7 +131,7 @@ bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, #endif } -void subgraph_stop(const SHARED_RUNTIME_VEC &runtimes) { +void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes) { cout << __FUNCTION__ << endl; runtimes.front()->notifyNextExit(); } diff --git a/src/runtime/subgraph/subgraph_function.h b/src/runtime/subgraph/subgraph_function.h index d2329d74c056..de0cdc26ff03 100644 --- a/src/runtime/subgraph/subgraph_function.h +++ b/src/runtime/subgraph/subgraph_function.h @@ -18,10 +18,11 @@ */ #ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ #define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ -#include "subgraph_data.h" #include #include +#include "subgraph_data.h" + using namespace std; using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; @@ -32,6 +33,6 @@ inline void subgraph_queue_push(QUEUE* queue, Array arrays); bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData); bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool sync = false); -void subgraph_stop(const SHARED_RUNTIME_VEC &runtimes); +void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes); #endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ diff --git a/src/runtime/subgraph/subgraph_struct.h b/src/runtime/subgraph/subgraph_struct.h index 0bc5dff3f7e0..f869b058ac9b 100644 --- a/src/runtime/subgraph/subgraph_struct.h +++ b/src/runtime/subgraph/subgraph_struct.h @@ -20,18 +20,19 @@ #define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ #include #include -#include +#include #include -#include #include #include -#include +#include +#include + #include -#include +#include #include -#include #include -#include +#include +#include #define SLOT slot_t<> #define SUB_Q_SIZE 1024 // #define SERIALIZE @@ -80,7 +81,7 @@ class subgraphData { private: void ResetDataList(size_t num) { if (max_num < num) { - for (int i = 0; i < max_num; i++) { + for (size_t i = 0; i < max_num; i++) { TVMArrayFree(dataList[i]); } @@ -115,7 +116,7 @@ class subgraphData { num = dlArray.size(); ResetDataList(num); - for (int i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { CreateCopyFrom(const_cast(dlArray[i].operator->()), &dataList[i], device_type, 
device_id); } @@ -133,7 +134,7 @@ class subgraphData { num = dlTensors.size(); ResetDataList(num); - for (int i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { CreateCopyFrom(dlTensors[i], &dataList[i], device_type, device_id); } return; @@ -143,7 +144,7 @@ class subgraphData { num = dlNum; ResetDataList(num); - for (int i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { auto dlTensor = const_cast(dlTensors[i]); CreateCopyFrom(dlTensor, &dataList[i], device_type, device_id); } @@ -156,8 +157,8 @@ class subgraphData { subgraphData(void) : num(0), max_num(0), dataList(nullptr) {} }; -template -class slot_t{ +template +class slot_t { public: bool bExit = false; subgraphData data; @@ -185,13 +186,12 @@ class slot_t{ } }; -template +template class subgraphOutputData { public: explicit subgraphOutputData(vector* datas) : datas_(datas) { ; } subgraphOutputData& operator=(const slot_t& slot) { - size_t num = slot.data.num; - assert(datas_->size() >= num); + assert(datas_->size() >= slot.data.num); for (size_t i = 0; i < slot.data.num; i++) { auto dlTensor = slot.data.dataList[i]; (*datas_)[i].CopyFrom(dlTensor); @@ -203,8 +203,8 @@ class subgraphOutputData { vector* datas_; }; -template -class squeue{ +template +class squeue { public: size_t len; volatile size_t head; @@ -214,7 +214,7 @@ class squeue{ }; typedef squeue QUEUE; -class RuntimeFunction{ +class RuntimeFunction { public: DLTensor* dlLocal = nullptr; Module module_; @@ -307,9 +307,8 @@ class RuntimeData { shared_ptr runtimePtr; template void ImportData(type dlTensors, size_t inputsLen) { - size_t num = runtimePtr->NumInputs(); - assert(num >= inputsLen); - for (int i = 0; i < inputsLen; i++) { + assert(runtimePtr->NumInputs() >= inputsLen); + for (size_t i = 0; i < inputsLen; i++) { /* * Use SetInput which have logic to handle * cross device memory copy to set input data. @@ -361,7 +360,6 @@ class RuntimeItem { void Run(void) { runtimePtr->Run(); } bool waitPipeLineData(bool bPollSuc) { - bool ret = false; /* wait input data ready. 
*/ @@ -388,7 +386,7 @@ class RuntimeItem { Array GetOutput(void) { Array outputs; size_t outputsNum = runtimePtr->NumOutputs(); - for (int i = 0; i < outputsNum; i++) { + for (size_t i = 0; i < outputsNum; i++) { auto output = runtimePtr->GetOutput(i); outputs.push_back(output); } @@ -396,5 +394,4 @@ class RuntimeItem { } }; - #endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ diff --git a/tests/python/relay/test_analysis_pipeline.py b/tests/python/relay/test_analysis_pipeline.py index e7d66f83d13f..a3938df61d0e 100644 --- a/tests/python/relay/test_analysis_pipeline.py +++ b/tests/python/relay/test_analysis_pipeline.py @@ -35,6 +35,7 @@ def run_module(mod, dev, target, dname, data): output = m.get_output(0).asnumpy() return output + def get_network(): dshape = (3, 3) mvalue1 = np.full((1), 5).astype("float32") @@ -67,7 +68,7 @@ def get_network(): #Prepare batch data for pipeline feeding """ datas = [] -for i in range(len(pl) +1): +for i in range(len(pl) + 1): datas.append(np.full(dshape, 3 + i).astype("float32")) """ @@ -97,7 +98,7 @@ def get_network(): lib, dev = relay.build(mods[0], "cuda"), tvm.gpu() else: lib, dev = relay.build(mods[0], "llvm"), tvm.cpu() - sub_mod.append({"lib":lib, "dev":dev}) + sub_mod.append({"lib": lib, "dev": dev}) """ #Build second subgraph, append module and device into @@ -107,19 +108,19 @@ def get_network(): lib, dev = relay.build(mods[1], "cuda"), tvm.gpu() else: lib, dev = relay.build(mods[1], "llvm"), tvm.cpu() - sub_mod.append({"lib":lib, "dev":dev}) + sub_mod.append({"lib": lib, "dev": dev}) """ #third subgraph """ lib = relay.build(mods[2], "llvm") - sub_mod.append({"lib":lib, "dev":tvm.cpu()}) + sub_mod.append({"lib": lib, "dev": tvm.cpu()}) """ #last subgraph """ lib = relay.build(mods[3], "llvm") - sub_mod.append({"lib":lib, "dev":tvm.cpu()}) + sub_mod.append({"lib": lib, "dev": tvm.cpu()}) """ #Create subgraph executor @@ -149,4 +150,4 @@ def get_network(): #Verify result """ for i in range(len(datas)): - tvm.testing.assert_allclose(outs[i],sub_outputs[i]) + tvm.testing.assert_allclose(outs[i], sub_outputs[i]) From 90b5761aa91097a4cc4c04d6d79f1434e2cf79c4 Mon Sep 17 00:00:00 2001 From: huajsj Date: Wed, 21 Apr 2021 17:47:55 -0700 Subject: [PATCH 03/28] fix task_build error. --- src/runtime/subgraph/subgraph_data.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/runtime/subgraph/subgraph_data.h b/src/runtime/subgraph/subgraph_data.h index 13e3a25a979c..0f2fd8816edd 100644 --- a/src/runtime/subgraph/subgraph_data.h +++ b/src/runtime/subgraph/subgraph_data.h @@ -28,12 +28,7 @@ #include "subgraph_struct.h" #ifdef __cplusplus -#if defined(__x86_64) -#define read_barrier() __asm__ __volatile__("" ::: "memory") -#else -#define dsb(opt) asm volatile("dsb " #opt : : : "memory") -#define read_barrier() dsb(st) -#endif +#define read_barrier() std::atomic_thread_fence(std::memory_order_acquire) template squeue* createQueue(squeue* q, size_t size) { From 4b9f92f731cffb6f1710b303dd62615f922ab745 Mon Sep 17 00:00:00 2001 From: huajsj Date: Wed, 21 Apr 2021 22:06:54 -0700 Subject: [PATCH 04/28] fix gpu task_build issue. 
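
This change removes the compile-time SERIALIZE fallback together with
the debug cout logging, and makes subgraph_poll loop in synchronous
mode until output data arrives or an exit notification is received.

The Python-side flow is unchanged; a minimal usage sketch (assuming
`smod` is a SubGraphModule built by tvm.contrib.subgraph_executor.create
and `data` is a placeholder input array):

    smod.set_input("data", data)  # feed the first subgraph
    smod.run()                    # push one batch into the pipeline
    outs = smod.get_output()      # polls until the last subgraph finishes
    smod.stop()                   # ask the pipeline threads to exit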
--- src/runtime/subgraph/subgraph_executor.cc | 8 +---- src/runtime/subgraph/subgraph_function.cc | 44 ++++++----------------- src/runtime/subgraph/subgraph_function.h | 4 +-- src/runtime/subgraph/subgraph_struct.h | 2 -- 4 files changed, 13 insertions(+), 45 deletions(-) diff --git a/src/runtime/subgraph/subgraph_executor.cc b/src/runtime/subgraph/subgraph_executor.cc index 3476cb03d15c..1ecf32fb8be7 100644 --- a/src/runtime/subgraph/subgraph_executor.cc +++ b/src/runtime/subgraph/subgraph_executor.cc @@ -34,7 +34,7 @@ void SubGraphRuntime::Stop() { subgraph_stop(runtimes); } /*! * \brief Run all the operations one by one. */ -void SubGraphRuntime::Run() { subgraph_run(runtimes, true); } +void SubGraphRuntime::Run() { subgraph_run(runtimes); } void SubGraphRuntime::Init(const Array& modules) { subgraph_init(modules, &runtimes); @@ -111,17 +111,11 @@ NDArray SubGraphRuntime::GetInput(const std::string& name, int mIndx) const { */ Array SubGraphRuntime::GetOutput(bool syncPoll) { Array nd; -#ifdef SERIALIZE - auto gruntime = runtimes.back(); - nd.push_back(gruntime->runtimePtr->GetOutput(0)); - DLTensor* dt = const_cast(nd.back().operator->()); -#else if (subgraph_poll(&output_entry_, runtimes, syncPoll)) { for (auto output : output_entry_) { nd.push_back(output); } } -#endif return nd; } diff --git a/src/runtime/subgraph/subgraph_function.cc b/src/runtime/subgraph/subgraph_function.cc index 3e9813006e9c..ea2408d675d0 100644 --- a/src/runtime/subgraph/subgraph_function.cc +++ b/src/runtime/subgraph/subgraph_function.cc @@ -40,17 +40,14 @@ void subgraph_pipeline_run(const int& num, const shared_ptr& curRun auto output = curRunItem->GetOutput(); subgraph_queue_push(nextQueue, output); - cout << num << " subgraph run..." << endl; curRunItem->notifyDataReadyToNext(); } curRunItem->notifyNextExit(); - - cout << "end " << __FUNCTION__ << " num " << num << endl; } thread* subgraph_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { for (size_t i = 1; i < runtimes->size(); i++) { - (*runtimes)[i]->t = move(thread(subgraph_pipeline_run, i, (*runtimes)[i])); + (*runtimes)[i]->t = thread(subgraph_pipeline_run, i, (*runtimes)[i]); } return NULL; } @@ -72,9 +69,7 @@ void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { (*runtimes)[i]->next = (*runtimes)[0]; } } -#ifndef SERIALIZE subgraph_pipeline_init(runtimes); -#endif return; } @@ -87,24 +82,7 @@ bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { return q_poll(queue, runtimeData); } -void subgraph_run_serial(const SHARED_RUNTIME_VEC runtimes) { - runtimes[0]->Run(); - for (size_t i = 1; i < runtimes.size(); i++) { - int oNum = runtimes[i - 1]->runtimePtr->NumOutputs(); - for (int j = 0; j < oNum; j++) { - auto o = runtimes[i - 1]->runtimePtr->GetOutput(j); - DLTensor* ptr = const_cast(o.operator->()); - runtimes[i]->runtimePtr->SetInput(j, ptr); - } - runtimes[i]->Run(); - } -} - -void subgraph_run(const SHARED_RUNTIME_VEC& runtimes, bool synch) { -#ifdef SERIALIZE - subgraph_run_serial(runtimes); - return; -#endif +void subgraph_run(const SHARED_RUNTIME_VEC& runtimes) { shared_ptr runtime = runtimes.front(); runtime->Run(); subgraph_queue_push(runtime->next->queue, runtime->GetOutput()); @@ -112,26 +90,24 @@ void subgraph_run(const SHARED_RUNTIME_VEC& runtimes, bool synch) { return; } -bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool synch) { +bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSynch) { shared_ptr firstRuntime = runtimes.front(); QUEUE* queue 
= firstRuntime->queue;
-#ifndef SERIALIZE
   bool suc = false;
   subgraphOutputData<> outputData(output);
   suc = q_poll>(queue, &outputData);
-  if (!suc) {
-    firstRuntime->waitPipeLineData(!synch);
+  while (!suc && bSynch) {
+    /*
+     * If get exit notify then break.
+     */
+    if (!firstRuntime->waitPipeLineData(!bSynch)) {
+      break;
+    }
     suc = q_poll>(queue, &outputData);
-    cout << "run done suc is " << suc << endl;
   }
   return suc;
-#else
-  subgraphOutputData<> outputData(output);
-  return q_poll>(queue, &outputData);
-#endif
 }
 
 void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes) {
-  cout << __FUNCTION__ << endl;
   runtimes.front()->notifyNextExit();
 }
diff --git a/src/runtime/subgraph/subgraph_function.h b/src/runtime/subgraph/subgraph_function.h
index de0cdc26ff03..6c933d241a90 100644
--- a/src/runtime/subgraph/subgraph_function.h
+++ b/src/runtime/subgraph/subgraph_function.h
@@ -28,11 +28,11 @@ using namespace tvm::runtime;
 typedef vector> SHARED_RUNTIME_VEC;
 
 void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes);
-void subgraph_run(const SHARED_RUNTIME_VEC& runtimes, bool synch = false);
+void subgraph_run(const SHARED_RUNTIME_VEC& runtimes);
 inline void subgraph_queue_push(QUEUE* queue, Array arrays);
 bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData);
 bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes,
-                   const bool sync = false);
+                   const bool bSync = false);
 void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes);
 
 #endif  // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_
diff --git a/src/runtime/subgraph/subgraph_struct.h b/src/runtime/subgraph/subgraph_struct.h
index f869b058ac9b..4f10348aade1 100644
--- a/src/runtime/subgraph/subgraph_struct.h
+++ b/src/runtime/subgraph/subgraph_struct.h
@@ -35,7 +35,6 @@
 #include
 #define SLOT slot_t<>
 #define SUB_Q_SIZE 1024
-// #define SERIALIZE
 using namespace tvm::runtime;
 using namespace std;
 // thread control struction, for single consumer single producer mode
@@ -72,7 +71,6 @@ class TControl {
     notify();
     if (t->joinable()) {
       t->join();
-      cout << "exit_notify suc" << endl;
     }
   }
 };

From 82ea9adf0777e361998386c1617ac10f5e0c2630 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Wed, 21 Apr 2021 23:42:04 -0700
Subject: [PATCH 05/28] Fix pylint issue

---
 src/runtime/subgraph/subgraph_function.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/runtime/subgraph/subgraph_function.cc b/src/runtime/subgraph/subgraph_function.cc
index ea2408d675d0..d3b1f076ad11 100644
--- a/src/runtime/subgraph/subgraph_function.cc
+++ b/src/runtime/subgraph/subgraph_function.cc
@@ -108,6 +108,4 @@ bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes,
   return suc;
 }
 
-void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes) {
-  runtimes.front()->notifyNextExit();
-}
+void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes) { runtimes.front()->notifyNextExit(); }

From 0dc1d74a06494fb48ee52ac182d91b3994d084ff Mon Sep 17 00:00:00 2001
From: huajsj
Date: Fri, 30 Apr 2021 21:04:45 -0700
Subject: [PATCH 06/28] add build_pipeline logic

---
 python/tvm/contrib/subgraph_executor.py      |  4 +-
 python/tvm/relay/build_module.py             | 48 +++++++++-
 tests/python/relay/test_analysis_pipeline.py | 93 +++++++++-----------
 3 files changed, 91 insertions(+), 54 deletions(-)

diff --git a/python/tvm/contrib/subgraph_executor.py b/python/tvm/contrib/subgraph_executor.py
index 3d323c565569..56e239bcb2c0 100644
--- a/python/tvm/contrib/subgraph_executor.py
+++ b/python/tvm/contrib/subgraph_executor.py
@@ -35,8 +35,8 @@ def create(sub_mods):
     """
     mods = []
     for sub_mod in sub_mods:
-        m = graph_executor.GraphModule(sub_mod["lib"]["default"](sub_mod["dev"]))
-        mods.append(m)
+        mod = graph_executor.GraphModule(sub_mod["default"](sub_mods[sub_mod]["dev"]))
+        mods.append(mod)
 
     submodule = SubGraphModule(mods)
     return submodule
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
index c67ac1dc423d..649b8ef04fed 100644
--- a/python/tvm/relay/build_module.py
+++ b/python/tvm/relay/build_module.py
@@ -246,13 +246,19 @@ def _module_export(module, file_name):
 
 # fcompile, addons, kwargs?
 @register_func("tvm.relay.build")
-def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"):
+def _build_module_no_factory(
+    mod, target=None, target_host=None, params=None, mod_name="default", config=None
+):
     """A wrapper around build which discards the Python GraphFactoryRuntime.
 
     This wrapper is suitable to be used from other programming languages as
     the runtime::Module can be freely passed between language boundaries.
     """
     target, target_host = Target.check_and_update_host_consist(target, target_host)
-    return build(mod, target, params=params, mod_name=mod_name).module
+    ret = build(mod, target, params=params, mod_name=mod_name, config=config)
+    if isinstance(ret, dict):
+        return ret
+
+    return ret.module
 
 
 def get_executor_from_target(target, target_host):
@@ -373,6 +379,44 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"
     return executor_factory
 
 
+def build_pipeline(ir_mods, config):
+    """Build a list of modules for pipeline execution.
+    Parameters:
+    ir_mods:
+        list of IRModule
+
+    config:
+        build configuration information, structured as follows.
+        {IRModule: {"target":target,
+                    "target_host":target_host,
+                    "params":params,
+                    "mod_name": mod_name,
+                    "build":build}}
+
+    Return:
+        dict that maps each built module to its device configuration
+    """
+    mods = {}
+    for ir_mod in ir_mods:
+        mod_config = config[ir_mod]
+        build_func = build
+        # if there is a self-defined build function then use it.
+        if mod_config["build"]:
+            build_func = mod_config["build"]
+
+        mod = build_func(
+            ir_mod,
+            mod_config["target"],
+            params=mod_config["params"],
+            target_host=mod_config["target_host"],
+            mod_name=mod_config["mod_name"],
+        )
+
+        mods[mod] = {"dev": mod_config["dev"]}
+
+    return mods
+
+
 def optimize(mod, target=None, params=None):
     """Helper function that optimizes a Relay module.
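For reference, a minimal sketch of the config mapping that this build_pipeline expects; the toy stage modules, the llvm-only targets, and the variable names below are illustrative assumptions rather than part of the patch:

import tvm
from tvm import relay
from tvm.relay.build_module import build_pipeline

# Two toy Relay modules standing in for real pipeline stages (assumed).
data = relay.var("data", relay.TensorType((3, 3), "float32"))
stage0 = tvm.IRModule.from_expr(relay.Function([data], relay.add(data, data)))
stage1 = tvm.IRModule.from_expr(relay.Function([data], relay.multiply(data, data)))

# One config entry per IRModule; "build": None falls back to the default build.
config = {
    stage0: {"target": "llvm", "target_host": None, "params": None,
             "mod_name": "default", "build": None, "dev": tvm.cpu()},
    stage1: {"target": "llvm", "target_host": None, "params": None,
             "mod_name": "default", "build": None, "dev": tvm.cpu()},
}

# Returns {built runtime module: {"dev": device}}, ready for executor creation.
mods = build_pipeline([stage0, stage1], config)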
diff --git a/tests/python/relay/test_analysis_pipeline.py b/tests/python/relay/test_analysis_pipeline.py index a3938df61d0e..ef05262fba56 100644 --- a/tests/python/relay/test_analysis_pipeline.py +++ b/tests/python/relay/test_analysis_pipeline.py @@ -36,8 +36,17 @@ def run_module(mod, dev, target, dname, data): return output -def get_network(): +def run_modules(mods, dev, target, dname, data): + for mod in mods: + data = run_module(mod, dev, target, dname, data) + + return data + + +def get_mannual_mod(): + mods = [] dshape = (3, 3) + data = relay.var("data", relay.TensorType(dshape, "float32")) mvalue1 = np.full((1), 5).astype("float32") mvalue2 = np.full((1), 2).astype("float32") mvalue3 = np.full((1), 3).astype("float32") @@ -46,29 +55,32 @@ def get_network(): mv2 = relay.Constant(tvm.nd.array(mvalue2)) mv3 = relay.Constant(tvm.nd.array(mvalue3)) mv4 = relay.Constant(tvm.nd.array(mvalue4)) - data = relay.var("data", relay.TensorType(dshape, "float32")) - net = relay.multiply(data, mv1) - net = relay.add(net, mv2) - net = relay.add(net, mv3) - net = relay.multiply(net, mv4) - net = relay.subtract(net, mv1) - func = relay.Function([data], net) - mod = tvm.IRModule.from_expr(func) - return mod, dshape + net1 = relay.multiply(data, mv1) + + net2 = relay.add(data, mv2) + net2 = relay.add(net2, mv3) + + net3 = relay.multiply(data, mv4) + + net4 = relay.subtract(data, mv1) + + mods.append(tvm.IRModule.from_expr(relay.Function([data], net1))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net2))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net3))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net4))) + + return mods, dshape -mod, dshape = get_network() """ #split compute graph into 4 subgraph """ -pl = [0, 2, 3] -mods = pipeline_graph(mod["main"], pl) - +mods, dshape = get_mannual_mod() """ #Prepare batch data for pipeline feeding """ datas = [] -for i in range(len(pl) + 1): +for i in range(len(mods) + 1): datas.append(np.full(dshape, 3 + i).astype("float32")) """ @@ -76,12 +88,22 @@ def get_network(): """ outs = [] for data in datas: - outs.append(run_module(mod, tvm.cpu(), "llvm", "data", data)) + outs.append(run_modules(mods, tvm.cpu(), "llvm", "data", data)) """ #Parameter use for subgraph executor creation """ -sub_mod = [] +mod_config = {} +for i in range(len(mods)): + mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} + if i < 2: + mconfig["target"] = "cuda" + mconfig["dev"] = tvm.gpu() + else: + mconfig["target"] = "llvm" + mconfig["dev"] = tvm.cpu() + + mod_config[mods[i]] = mconfig """ #Build module and append module and device type into variable that @@ -90,42 +112,11 @@ def get_network(): #last subgraph use cpu """ with relay.build_config(opt_level=3): - """ - #Build first subgraph and create parameter for subgraph - #creation. - """ - if tvm.testing.device_enabled("cuda"): - lib, dev = relay.build(mods[0], "cuda"), tvm.gpu() - else: - lib, dev = relay.build(mods[0], "llvm"), tvm.cpu() - sub_mod.append({"lib": lib, "dev": dev}) - - """ - #Build second subgraph, append module and device into - #subgraph creation parameter. 
- """ - if tvm.testing.device_enabled("cuda"): - lib, dev = relay.build(mods[1], "cuda"), tvm.gpu() - else: - lib, dev = relay.build(mods[1], "llvm"), tvm.cpu() - sub_mod.append({"lib": lib, "dev": dev}) - - """ - #third subgraph - """ - lib = relay.build(mods[2], "llvm") - sub_mod.append({"lib": lib, "dev": tvm.cpu()}) - - """ - #last subgraph - """ - lib = relay.build(mods[3], "llvm") - sub_mod.append({"lib": lib, "dev": tvm.cpu()}) - + pipeline_mod = tvm.relay.build(mods, config=mod_config) """ #Create subgraph executor """ -smod = subgraph_executor.create(sub_mod) +smod = subgraph_executor.create(pipeline_mod) """ #Use subgraph executor to pipeline the said subgraph which use different backend @@ -151,3 +142,5 @@ def get_network(): """ for i in range(len(datas)): tvm.testing.assert_allclose(outs[i], sub_outputs[i]) + +print("run suc") From f8504790f92df3177d32133e912516863e4c566c Mon Sep 17 00:00:00 2001 From: huajsj Date: Fri, 30 Apr 2021 23:16:09 -0700 Subject: [PATCH 07/28] remove pipeline_graph --- python/tvm/relay/analysis/analysis.py | 128 -------------------------- 1 file changed, 128 deletions(-) diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 0192621b0901..38bcf4b904b3 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -448,131 +448,3 @@ def get_calibration_data(mod, data): calib_data[gvar] = value return calib_data - - -def pipeline_graph(expr, indices): - """Split Graph Into A Group Of Subgraph - Parameters - ---------- - expr : tvm.relay.Expr - indices : Array[int] - - Returns - ------- - ret : Array[tvm.relay.IRModule] - """ - - def run_opt_pass(expr, opt_pass): - """Exectue a relay pass""" - assert isinstance(opt_pass, tvm.transform.Pass) - mod = tvm.IRModule.from_expr(expr) - mod = tvm.relay.transform.InferType()(mod) - mod = opt_pass(mod) - entry = mod["main"] - return entry if isinstance(expr, tvm.relay.Function) else entry.body - - def _operator_idx_inc(expr, operator_current_idx): - """Increase operator index""" - if not isinstance(expr, tvm.relay.expr.Constant): - operator_current_idx = operator_current_idx + 1 - - return operator_current_idx - - def merge_constant_expr(constant_expr, expr): - # merge constant express with a express - # Parameters - # ---------- - # constant_expr: - # constant expression - # expr: - # expression to merge with constant expression - - # If body not let, then reached end of the express - if not isinstance(constant_expr.body, tvm.relay.expr.Let): - return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr) - - return tvm.relay.expr.Let( - constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr) - ) - - def _recursion(anf, operator_indx, pipeline_mods, indices, constant_expr): - # Enumrate all operator of compute graph then split the compute graph - # into a group subgraph. 
- # Parameters - # ---------- - # anf: - # ANF format expression - # operator_indx: - # current operator indice - # pipeline_mods: - # the subgraph list get storage in this variable - # indices: - # Array of indices use to define the subgraph scope - # constant_expr: - # constant defined before current operator - - # Do the split work - if isinstance(anf, tvm.relay.Function): - return tvm.relay.Function( - anf.params, - _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr), - anf.ret_type, - anf.type_params, - anf.attrs, - ) - if isinstance(anf, tvm.relay.expr.Let): - value = anf.value - operator_indx = _operator_idx_inc(value, operator_indx) - - # record constan expr to make sure all sugraph can find correct - # constant. - if isinstance(value, tvm.relay.expr.Constant): - if not constant_expr: - constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var) - else: - constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr) - - if isinstance(value, tvm.relay.expr.Call): - if isinstance(value.op, tvm.ir.Op): - - # if have expr a(b(c(d(e)))) and indexes are [1,2,3] - # then would get separate modules for a(b),c,d(e). - # the split area is a(b)[0,1] c[2,2] d(e)[2,3] - if indices and operator_indx == indices[0]: - indices.pop(0) - ann = _recursion( - anf.body, operator_indx, pipeline_mods, indices, constant_expr - ) - - # when current subgraph use previous subgraph constant, - # such constant may become free varaible due to the constant - # not exist, merge the previous constant with current subgraph - # to avoid such issue. - if constant_expr: - ann = merge_constant_expr(constant_expr, ann) - - ann = run_opt_pass(ann, transform.ToGraphNormalForm()) - mod = tvm.IRModule.from_expr(ann) - pipeline_mods.insert(0, mod) - return tvm.relay.expr.Let(anf.var, value, anf.var) - return tvm.relay.expr.Let( - anf.var, - value, - _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr), - ) - else: - return anf - - pipeline_mods = [] - - # operator count start from 0, then initial value get set into -1 - operator_indx = -1 - constant_expr = None - subgraph_indices = indices.copy() - anf = run_opt_pass(expr, transform.ToANormalForm()) - anf = run_opt_pass(anf, transform.InferType()) - ann = _recursion(anf, operator_indx, pipeline_mods, subgraph_indices, constant_expr) - ann = run_opt_pass(ann.body, transform.ToGraphNormalForm()) - mod = tvm.IRModule.from_expr(ann) - pipeline_mods.insert(0, mod) - return pipeline_mods From 3a2e2a777ea3cee8b4391cdf835a6938b3a1d1b7 Mon Sep 17 00:00:00 2001 From: huajsj Date: Fri, 30 Apr 2021 23:43:47 -0700 Subject: [PATCH 08/28] rename subgraph executor into pipeline executor --- cmake/config.cmake | 2 +- ...graph_executor.py => pipeline_executor.py} | 18 +++++----- .../pipeline_data.h} | 8 ++--- .../pipeline_executor.cc} | 24 ++++++------- .../pipeline_executor.h} | 10 +++--- .../pipeline_function.cc} | 34 +++++++++---------- .../pipeline_function.h} | 20 +++++------ .../pipeline_struct.h} | 20 +++++------ tests/python/relay/test_analysis_pipeline.py | 33 ++++++++---------- 9 files changed, 83 insertions(+), 86 deletions(-) rename python/tvm/contrib/{subgraph_executor.py => pipeline_executor.py} (92%) rename src/runtime/{subgraph/subgraph_data.h => pipeline/pipeline_data.h} (92%) rename src/runtime/{subgraph/subgraph_executor.cc => pipeline/pipeline_executor.cc} (89%) rename src/runtime/{subgraph/subgraph_executor.h => pipeline/pipeline_executor.h} (93%) rename src/runtime/{subgraph/subgraph_function.cc => 
pipeline/pipeline_function.cc} (73%) rename src/runtime/{subgraph/subgraph_function.h => pipeline/pipeline_function.h} (65%) rename src/runtime/{subgraph/subgraph_struct.h => pipeline/pipeline_struct.h} (95%) diff --git a/cmake/config.cmake b/cmake/config.cmake index 515d041d49b8..408463e772a5 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -102,7 +102,7 @@ set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable subgraph runtime. -set(USE_SUBGRAPH_EXECUTOR ON) +set(USE_PIPELINE_EXECUTOR ON) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) diff --git a/python/tvm/contrib/subgraph_executor.py b/python/tvm/contrib/pipeline_executor.py similarity index 92% rename from python/tvm/contrib/subgraph_executor.py rename to python/tvm/contrib/pipeline_executor.py index 56e239bcb2c0..f4958d363f75 100644 --- a/python/tvm/contrib/subgraph_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -14,13 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Minimum subgraph executor that executes subgraph containing TVM PackedFunc.""" +"""Minimum pipeline executor that executes pipeline containing TVM PackedFunc.""" import tvm._ffi from tvm.contrib import graph_executor def create(sub_mods): - """Create a subgraph runtime executor. + """Create a pipeline runtime executor. Parameters ---------- @@ -30,19 +30,19 @@ def create(sub_mods): Returns ------- - submodule : SubGraphModule - Runtime subgraph module. + submodule : PipelineModule + Runtime pipeline module. """ mods = [] for sub_mod in sub_mods: mod = graph_executor.GraphModule(sub_mod["default"](sub_mods[sub_mod]["dev"])) mods.append(mod) - submodule = SubGraphModule(mods) + submodule = PipelineModule(mods) return submodule -class SubGraphModule(object): +class PipelineModule(object): """Wrapper runtime module. This is a thin wrapper of the underlying TVM module. @@ -66,8 +66,8 @@ def __init__(self, graph_modules): for module in graph_modules: mods.append(module.module) - subgraphcreate = tvm._ffi.get_global_func("tvm.subgraph_executor.create") - module = subgraphcreate(mods) + pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") + module = pipelinecreate(mods) self.graph_modules_ = graph_modules @@ -115,7 +115,7 @@ def run(self, **input_dict): self._run() def stop(self): - """Stop subgraph run""" + """Stop pipeline run""" self._stop() def get_num_outputs(self): diff --git a/src/runtime/subgraph/subgraph_data.h b/src/runtime/pipeline/pipeline_data.h similarity index 92% rename from src/runtime/subgraph/subgraph_data.h rename to src/runtime/pipeline/pipeline_data.h index 0f2fd8816edd..0f91a014cd35 100644 --- a/src/runtime/subgraph/subgraph_data.h +++ b/src/runtime/pipeline/pipeline_data.h @@ -16,8 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ -#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ -#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_DATA_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_DATA_H_ #define EXPORT __attribute__((visibility("default"))) #define IMPORT #include @@ -25,7 +25,7 @@ #include #include -#include "subgraph_struct.h" +#include "pipeline_struct.h" #ifdef __cplusplus #define read_barrier() std::atomic_thread_fence(std::memory_order_acquire) @@ -71,4 +71,4 @@ bool q_poll(squeue* q, VARIABLE_TYPE* s) { // extern "C" #endif -#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_DATA_H_ +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_DATA_H_ diff --git a/src/runtime/subgraph/subgraph_executor.cc b/src/runtime/pipeline/pipeline_executor.cc similarity index 89% rename from src/runtime/subgraph/subgraph_executor.cc rename to src/runtime/pipeline/pipeline_executor.cc index 1ecf32fb8be7..d20292bf1852 100644 --- a/src/runtime/subgraph/subgraph_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -18,9 +18,9 @@ */ /*! - * \file subgraph_runtime.cc + * \file pipeline_runtime.cc */ -#include "subgraph_executor.h" +#include "pipeline_executor.h" #include @@ -28,16 +28,16 @@ namespace tvm { namespace runtime { /*! - *\bief Stop subgraph run. + *\bief Stop pipeline run. */ -void SubGraphRuntime::Stop() { subgraph_stop(runtimes); } +void SubGraphRuntime::Stop() { pipeline_stop(runtimes); } /*! * \brief Run all the operations one by one. */ -void SubGraphRuntime::Run() { subgraph_run(runtimes); } +void SubGraphRuntime::Run() { pipeline_run(runtimes); } void SubGraphRuntime::Init(const Array& modules) { - subgraph_init(modules, &runtimes); + pipeline_init(modules, &runtimes); SetupStorage(); return; } @@ -77,14 +77,14 @@ void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in) { /*! * \brief Get the number of outputs * - * \return The number of outputs from last subgraph. + * \return The number of outputs from last pipeline. */ int SubGraphRuntime::NumOutputs() const { return runtimes.back()->runtimePtr->NumOutputs(); } /*! * \brief Get the number of inputs * - * \return The number of inputs to the first subgraph. + * \return The number of inputs to the first pipeline. 
*/ int SubGraphRuntime::NumInputs() const { return runtimes.front()->runtimePtr->NumInputs(); } @@ -111,7 +111,7 @@ NDArray SubGraphRuntime::GetInput(const std::string& name, int mIndx) const { */ Array SubGraphRuntime::GetOutput(bool syncPoll) { Array nd; - if (subgraph_poll(&output_entry_, runtimes, syncPoll)) { + if (pipeline_poll(&output_entry_, runtimes, syncPoll)) { for (auto output : output_entry_) { nd.push_back(output); } @@ -169,14 +169,14 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, } } -Module SubGraphRuntimeCreate(const Array& m) { +Module PipelineRuntimeCreate(const Array& m) { auto exec = make_object(); exec->Init(m); return Module(exec); } -TVM_REGISTER_GLOBAL("tvm.subgraph_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = SubGraphRuntimeCreate(args[0]); +TVM_REGISTER_GLOBAL("tvm.pipeline_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = PipelineRuntimeCreate(args[0]); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/subgraph/subgraph_executor.h b/src/runtime/pipeline/pipeline_executor.h similarity index 93% rename from src/runtime/subgraph/subgraph_executor.h rename to src/runtime/pipeline/pipeline_executor.h index 14ac0d5355ae..91d2fc0b3dba 100644 --- a/src/runtime/subgraph/subgraph_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -22,19 +22,19 @@ * containing only tvm PackedFunc. * \file graph_runtime.h */ -#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ -#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ #include #include #include -#include "subgraph_function.h" +#include "pipeline_function.h" namespace tvm { namespace runtime { /*! - * \brief subgraph runtime. + * \brief pipeline runtime. * * This runtime can be acccesibly in various language via * TVM runtime PackedFunc API. @@ -106,4 +106,4 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_EXECUTOR_H_ +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ diff --git a/src/runtime/subgraph/subgraph_function.cc b/src/runtime/pipeline/pipeline_function.cc similarity index 73% rename from src/runtime/subgraph/subgraph_function.cc rename to src/runtime/pipeline/pipeline_function.cc index d3b1f076ad11..7e3656d5181b 100644 --- a/src/runtime/subgraph/subgraph_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -16,12 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ -#include "subgraph_function.h" +#include "pipeline_function.h" #include using namespace tvm::runtime; -void subgraph_pipeline_run(const int& num, const shared_ptr& curRunItem) { +void pipeline_pipeline_run(const int& num, const shared_ptr& curRunItem) { QUEUE* curQueue = curRunItem->queue; QUEUE* nextQueue = curRunItem->next->queue; @@ -31,7 +31,7 @@ void subgraph_pipeline_run(const int& num, const shared_ptr& curRun */ bool suc = false; while (curRunItem->waitPipeLineData(suc)) { - suc = subgraph_queue_poll(curQueue, &curRunItem->rData); + suc = pipeline_queue_poll(curQueue, &curRunItem->rData); if (!suc) { continue; } @@ -39,20 +39,20 @@ void subgraph_pipeline_run(const int& num, const shared_ptr& curRun curRunItem->Run(); auto output = curRunItem->GetOutput(); - subgraph_queue_push(nextQueue, output); + pipeline_queue_push(nextQueue, output); curRunItem->notifyDataReadyToNext(); } curRunItem->notifyNextExit(); } -thread* subgraph_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { +thread* pipeline_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { for (size_t i = 1; i < runtimes->size(); i++) { - (*runtimes)[i]->t = thread(subgraph_pipeline_run, i, (*runtimes)[i]); + (*runtimes)[i]->t = thread(pipeline_pipeline_run, i, (*runtimes)[i]); } return NULL; } -void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { +void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { int len = graphRuntimes.size(); for (int i = 0; i < len; i++) { QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); @@ -69,33 +69,33 @@ void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { (*runtimes)[i]->next = (*runtimes)[0]; } } - subgraph_pipeline_init(runtimes); + pipeline_pipeline_init(runtimes); return; } -inline void subgraph_queue_push(QUEUE* queue, Array arrays) { +inline void pipeline_queue_push(QUEUE* queue, Array arrays) { q_push>(queue, arrays); return; } -bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { +bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { return q_poll(queue, runtimeData); } -void subgraph_run(const SHARED_RUNTIME_VEC& runtimes) { +void pipeline_run(const SHARED_RUNTIME_VEC& runtimes) { shared_ptr runtime = runtimes.front(); runtime->Run(); - subgraph_queue_push(runtime->next->queue, runtime->GetOutput()); + pipeline_queue_push(runtime->next->queue, runtime->GetOutput()); runtime->notifyDataReadyToNext(); return; } -bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSynch) { +bool pipeline_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSynch) { shared_ptr firstRuntime = runtimes.front(); QUEUE* queue = firstRuntime->queue; bool suc = false; - subgraphOutputData<> outputData(output); - suc = q_poll>(queue, &outputData); + pipelineOutputData<> outputData(output); + suc = q_poll>(queue, &outputData); while (!suc && bSynch) { /* * If get exit notify then break. 
@@ -103,9 +103,9 @@ bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, if (!firstRuntime->waitPipeLineData(!bSynch)) { break; } - suc = q_poll>(queue, &outputData); + suc = q_poll>(queue, &outputData); } return suc; } -void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes) { runtimes.front()->notifyNextExit(); } +void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes) { runtimes.front()->notifyNextExit(); } diff --git a/src/runtime/subgraph/subgraph_function.h b/src/runtime/pipeline/pipeline_function.h similarity index 65% rename from src/runtime/subgraph/subgraph_function.h rename to src/runtime/pipeline/pipeline_function.h index 6c933d241a90..8236399c4057 100644 --- a/src/runtime/subgraph/subgraph_function.h +++ b/src/runtime/pipeline/pipeline_function.h @@ -16,23 +16,23 @@ * specific language governing permissions and limitations * under the License. */ -#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ -#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #include #include -#include "subgraph_data.h" +#include "pipeline_data.h" using namespace std; using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; -void subgraph_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes); -void subgraph_run(const SHARED_RUNTIME_VEC& runtimes); -inline void subgraph_queue_push(QUEUE* queue, Array arrays); -bool subgraph_queue_poll(QUEUE* queue, RuntimeData* runtimeData); -bool subgraph_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, +void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes); +void pipeline_run(const SHARED_RUNTIME_VEC& runtimes); +inline void pipeline_queue_push(QUEUE* queue, Array arrays); +bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData); +bool pipeline_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSync = false); -void subgraph_stop(const SHARED_RUNTIME_VEC& runtimes); +void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes); -#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_FUNCTION_H_ +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ diff --git a/src/runtime/subgraph/subgraph_struct.h b/src/runtime/pipeline/pipeline_struct.h similarity index 95% rename from src/runtime/subgraph/subgraph_struct.h rename to src/runtime/pipeline/pipeline_struct.h index 4f10348aade1..0e99fb991793 100644 --- a/src/runtime/subgraph/subgraph_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -16,8 +16,8 @@ * specific language governing permissions and limitations * under the License. 
*/ -#ifndef TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ -#define TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ #include #include #include @@ -75,7 +75,7 @@ class TControl { } }; -class subgraphData { +class pipelineData { private: void ResetDataList(size_t num) { if (max_num < num) { @@ -152,14 +152,14 @@ class subgraphData { size_t max_num; DLTensor** dataList; TControl controlData; - subgraphData(void) : num(0), max_num(0), dataList(nullptr) {} + pipelineData(void) : num(0), max_num(0), dataList(nullptr) {} }; template class slot_t { public: bool bExit = false; - subgraphData data; + pipelineData data; slot_t(void) {} // overwrite operator = to handle "(slot) s = (OutputData) d;" @@ -185,10 +185,10 @@ class slot_t { }; template -class subgraphOutputData { +class pipelineOutputData { public: - explicit subgraphOutputData(vector* datas) : datas_(datas) { ; } - subgraphOutputData& operator=(const slot_t& slot) { + explicit pipelineOutputData(vector* datas) : datas_(datas) { ; } + pipelineOutputData& operator=(const slot_t& slot) { assert(datas_->size() >= slot.data.num); for (size_t i = 0; i < slot.data.num; i++) { auto dlTensor = slot.data.dataList[i]; @@ -249,7 +249,7 @@ class RuntimeFunction { int NumInputs() const { return get_num_inputs(); } /* - when doing subgraph pipeline, the from data and to + when doing pipeline, the from data and to data may comming from different device, for example one from GPU another from VTA, here we need first copy it into cpu type memory from GPU then copy the @@ -392,4 +392,4 @@ class RuntimeItem { } }; -#endif // TVM_RUNTIME_SUBGRAPH_SUBGRAPH_STRUCT_H_ +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ diff --git a/tests/python/relay/test_analysis_pipeline.py b/tests/python/relay/test_analysis_pipeline.py index ef05262fba56..f1a89600c09d 100644 --- a/tests/python/relay/test_analysis_pipeline.py +++ b/tests/python/relay/test_analysis_pipeline.py @@ -20,8 +20,7 @@ import tvm.testing from tvm import relay from tvm.relay import transform -from tvm.contrib import graph_executor, subgraph_executor -from tvm.relay.analysis import pipeline_graph +from tvm.contrib import graph_executor, pipeline_executor def run_module(mod, dev, target, dname, data): @@ -73,7 +72,7 @@ def get_mannual_mod(): """ -#split compute graph into 4 subgraph +#split compute graph into 4 pipeline """ mods, dshape = get_mannual_mod() """ @@ -91,7 +90,7 @@ def get_mannual_mod(): outs.append(run_modules(mods, tvm.cpu(), "llvm", "data", data)) """ -#Parameter use for subgraph executor creation +#Parameter use for pipeline executor creation """ mod_config = {} for i in range(len(mods)): @@ -107,40 +106,38 @@ def get_mannual_mod(): """ #Build module and append module and device type into variable that -#use for subgraph creation. -#first and second subgraph use cuda when cuda enable, second and -#last subgraph use cpu +#use for pipeline creation. 
+#first and second pipeline use cuda when cuda enable, second and
+#last pipeline use cpu
 """
 with relay.build_config(opt_level=3):
     pipeline_mod = tvm.relay.build(mods, config=mod_config)
 """
-#Create subgraph executor
+#Create pipeline executor
 """
-smod = subgraph_executor.create(sub_mod)
+pipeline_module = pipeline_executor.create(pipeline_mod)
 
 """
-#Use subgraph executor to pipeline the said subgraph which use different backend
+#Use pipeline executor to pipeline the said pipeline which use different backend
 """
 for data in datas:
-    smod.set_input("data", data)
-    smod.run()
+    pipeline_module.set_input("data", data)
+    pipeline_module.run()
 
 """
 Get result
 """
-sub_outputs = []
+pipeline_outputs = []
 for i in range(len(datas)):
-    sub_outputs.append(smod.get_output()[0].asnumpy())
+    pipeline_outputs.append(pipeline_module.get_output()[0].asnumpy())
 
 """
 #Stop pipeline execution.
 """
-smod.stop()
+pipeline_module.stop()
 """
 
 #Verify result
 """
 for i in range(len(datas)):
-    tvm.testing.assert_allclose(outs[i], sub_outputs[i])
+    tvm.testing.assert_allclose(outs[i], pipeline_outputs[i])

From 8793cdd05051e3796dce54aa7e50e13b0daec756 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Mon, 3 May 2021 10:33:11 -0700
Subject: [PATCH 09/28] Fix pylint issue.

---
 python/tvm/relay/analysis/analysis.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py
index 38bcf4b904b3..c7b6c60849a1 100644
--- a/python/tvm/relay/analysis/analysis.py
+++ b/python/tvm/relay/analysis/analysis.py
@@ -20,7 +20,6 @@
 This file contains the set of passes for Relay, which exposes an interface for
 configuring the passes and scripting them in Python.
 """
-import tvm
 from ...ir import IRModule
 from ...relay import transform, build_module
 from ...runtime.ndarray import cpu

From 7e175dedc066e4eefc5779927811778f9e59b9ae Mon Sep 17 00:00:00 2001
From: huajsj
Date: Mon, 3 May 2021 13:42:28 -0700
Subject: [PATCH 10/28] add cuda check

---
 python/tvm/contrib/pipeline_executor.py      |  48 ++++++-
 tests/python/relay/test_analysis_pipeline.py | 143 -------------------
 tests/python/relay/test_pipeline_executor.py | 140 ++++++++++++++++++
 3 files changed, 185 insertions(+), 146 deletions(-)
 delete mode 100644 tests/python/relay/test_analysis_pipeline.py
 create mode 100644 tests/python/relay/test_pipeline_executor.py

diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index f4958d363f75..0c10811d3512 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -16,10 +16,48 @@
 # under the License.
 """Minimum pipeline executor that executes pipeline containing TVM PackedFunc."""
 import tvm._ffi
+from tvm import relay
 from tvm.contrib import graph_executor
 
 
+def build_pipeline(ir_mods, config):
+    """Build a list of modules for pipeline execution.
+    Parameters:
+    ir_mods:
+        list of IRModule
+
+    config:
+        build configuration information, structured as follows.
+        {IRModule: {"target":target,
+                    "target_host":target_host,
+                    "params":params,
+                    "mod_name": mod_name,
+                    "build":build}}
+
+    Return:
+        dict that maps each built module to its device configuration
+    """
+    mods = {}
+    for ir_mod in ir_mods:
+        mod_config = config[ir_mod]
+        build_func = relay.build
+        # if there is a self-defined build function then use it.
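+        # Note: a user-supplied "build" callable is assumed to take the same
+        # arguments as relay.build is given below (ir_mod, target, params,
+        # target_host, mod_name) and to return a built runtime module.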
+ if mod_config["build"]: + build_func = mod_config.build + + mod = build_func( + ir_mod, + mod_config["target"], + params=mod_config["params"], + target_host=mod_config["target_host"], + mod_name=mod_config["mod_name"], + ) + + mods[mod] = {"dev": mod_config["dev"]} + + return mods + +def create(mods, mod_config): """Create a pipeline runtime executor. Parameters @@ -33,9 +71,13 @@ def create(sub_mods): submodule : PipelineModule Runtime pipeline module. """ + pipeline_mods = build_pipeline(mods, config=mod_config) + mods = [] - for sub_mod in sub_mods: - mod = graph_executor.GraphModule(sub_mod["default"](sub_mods[sub_mod]["dev"])) + for pipeline_mod in pipeline_mods: + mod = graph_executor.GraphModule( + pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"])) + mods.append(mod) submodule = PipelineModule(mods) diff --git a/tests/python/relay/test_analysis_pipeline.py b/tests/python/relay/test_analysis_pipeline.py deleted file mode 100644 index f1a89600c09d..000000000000 --- a/tests/python/relay/test_analysis_pipeline.py +++ /dev/null @@ -1,143 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import numpy as np -import tvm -import tvm.testing -from tvm import relay -from tvm.relay import transform -from tvm.contrib import graph_executor, pipeline_executor - - -def run_module(mod, dev, target, dname, data): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target) - - m = graph_executor.GraphModule(lib["default"](dev)) - m.set_input(dname, data) - m.run() - n = m.get_num_outputs() - output = m.get_output(0).asnumpy() - return output - - -def run_modules(mods, dev, target, dname, data): - for mod in mods: - data = run_module(mod, dev, target, dname, data) - - return data - - -def get_mannual_mod(): - mods = [] - dshape = (3, 3) - data = relay.var("data", relay.TensorType(dshape, "float32")) - mvalue1 = np.full((1), 5).astype("float32") - mvalue2 = np.full((1), 2).astype("float32") - mvalue3 = np.full((1), 3).astype("float32") - mvalue4 = np.full((1), 4).astype("float32") - mv1 = relay.Constant(tvm.nd.array(mvalue1)) - mv2 = relay.Constant(tvm.nd.array(mvalue2)) - mv3 = relay.Constant(tvm.nd.array(mvalue3)) - mv4 = relay.Constant(tvm.nd.array(mvalue4)) - net1 = relay.multiply(data, mv1) - - net2 = relay.add(data, mv2) - net2 = relay.add(net2, mv3) - - net3 = relay.multiply(data, mv4) - - net4 = relay.subtract(data, mv1) - - mods.append(tvm.IRModule.from_expr(relay.Function([data], net1))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net2))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net3))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net4))) - - return mods, dshape - - -""" -#split compute graph into 4 pipeline -""" -mods, dshape = get_mannual_mod() -""" -#Prepare batch data for pipeline feeding -""" -datas = [] -for i in range(len(mods) + 1): - datas.append(np.full(dshape, 3 + i).astype("float32")) - -""" -#Run with graph executor for verification purpose -""" -outs = [] -for data in datas: - outs.append(run_modules(mods, tvm.cpu(), "llvm", "data", data)) - -""" -#Parameter use for pipeline executor creation -""" -mod_config = {} -for i in range(len(mods)): - mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} - if i < 2: - mconfig["target"] = "cuda" - mconfig["dev"] = tvm.gpu() - else: - mconfig["target"] = "llvm" - mconfig["dev"] = tvm.cpu() - - mod_config[mods[i]] = mconfig - -""" -#Build module and append module and device type into variable that -#use for pipeline creation. -#first and second pipeline use cuda when cuda enable, second and -#last pipeline use cpu -""" -with relay.build_config(opt_level=3): - pipeline_mod = tvm.relay.build(mods, config=mod_config) -""" -#Create pipeline executor -""" -pipeline_module = pipeline_executor.create(pipeline_mod) - -""" -#Use pipeline executor to pipeline the said pipeline which use different backend -""" -for data in datas: - pipeline_module.set_input("data", data) - pipeline_module.run() - -""" -Get result -""" -pipeline_outputs = [] -for i in range(len(datas)): - pipeline_outputs.append(pipeline_module.get_output()[0].asnumpy()) - -""" -#Stop pipeline execution. 
-""" -pipeline_module.stop() -""" - -#Verify result -""" -for i in range(len(datas)): - tvm.testing.assert_allclose(outs[i], pipeline_outputs[i]) diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py new file mode 100644 index 000000000000..614e62ef5899 --- /dev/null +++ b/tests/python/relay/test_pipeline_executor.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import tvm +import tvm.testing +from tvm import relay +from tvm.relay import transform +from tvm.contrib import graph_executor, pipeline_executor + + +def run_modules(mods, dev, target, dname, data): + for mod in mods: + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target) + + m = graph_executor.GraphModule(lib["default"](dev)) + m.set_input(dname, data) + m.run() + n = m.get_num_outputs() + output = m.get_output(0).asnumpy() + data = output + + return output + + +def get_mannual_mod(): + mods = [] + dshape = (3, 3) + data = relay.var("data", relay.TensorType(dshape, "float32")) + mvalue1 = np.full((1), 5).astype("float32") + mvalue2 = np.full((1), 2).astype("float32") + mvalue3 = np.full((1), 3).astype("float32") + mvalue4 = np.full((1), 4).astype("float32") + mv1 = relay.Constant(tvm.nd.array(mvalue1)) + mv2 = relay.Constant(tvm.nd.array(mvalue2)) + mv3 = relay.Constant(tvm.nd.array(mvalue3)) + mv4 = relay.Constant(tvm.nd.array(mvalue4)) + net1 = relay.multiply(data, mv1) + + net2 = relay.add(data, mv2) + net2 = relay.add(net2, mv3) + + net3 = relay.multiply(data, mv4) + + net4 = relay.subtract(data, mv1) + + mods.append(tvm.IRModule.from_expr(relay.Function([data], net1))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net2))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net3))) + mods.append(tvm.IRModule.from_expr(relay.Function([data], net4))) + + return mods, dshape + + +def test_pipeline(): + """ + #split compute graph into 4 pipeline + """ + mods, dshape = get_mannual_mod() + """ + #Prepare batch data for pipeline feeding + """ + datas = [] + for i in range(len(mods) + 1): + datas.append(np.full(dshape, 3 + i).astype("float32")) + + """ + #Run with graph executor for verification purpose + """ + outs = [run_modules(mods, tvm.cpu(), "llvm", "data", data) for data in datas] + + """ + #Parameter use for pipeline executor creation + #Build module and append module and device type into variable that + #use for pipeline creation. 
+ #first and second pipeline use cuda when cuda enable, second and + #last pipeline use cpu + """ + mod_config = {} + for i in range(len(mods)): + mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} + # if cuda enabled, first 2 module us cuda as target + if i < 2 and tvm.testing.device_enabled("cuda"): + mconfig["target"] = "cuda" + mconfig["dev"] = tvm.gpu() + else: + mconfig["target"] = "llvm" + mconfig["dev"] = tvm.cpu() + + mod_config[mods[i]] = mconfig + + """ + #build and create pipeline module + """ + with relay.build_config(opt_level=3): + pipeline_module = pipeline_executor.create(mods, mod_config) + + """ + #Use pipeline executor to pipeline the said pipeline which use different backend + """ + for data in datas: + pipeline_module.set_input("data", data) + pipeline_module.run() + + """ + Get result + """ + pipeline_outputs = [] + for i in range(len(datas)): + pipeline_outputs.append(pipeline_module.get_output()[0].asnumpy()) + + """ + #Stop pipeline execution. + """ + pipeline_module.stop() + """ + + #Verify result + """ + for ref_out, out in zip(outs, pipeline_outputs): + tvm.testing.assert_allclose(ref_out, out) + + +if __name__ == "__main__": + test_pipeline() From ee9082847cc041a3439aa1538727c70fd9d023b1 Mon Sep 17 00:00:00 2001 From: huajsj Date: Tue, 4 May 2021 19:07:25 -0700 Subject: [PATCH 11/28] address review comments. --- python/tvm/contrib/pipeline_executor.py | 4 +- python/tvm/relay/build_module.py | 48 +------------------- tests/python/relay/test_pipeline_executor.py | 31 +++++++------ 3 files changed, 21 insertions(+), 62 deletions(-) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 0c10811d3512..de8040ff739c 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -57,6 +57,7 @@ def build_pipeline(ir_mods, config): return mods + def create(mods, mod_config): """Create a pipeline runtime executor. @@ -76,7 +77,8 @@ def create(mods, mod_config): mods = [] for pipeline_mod in pipeline_mods: mod = graph_executor.GraphModule( - pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"])) + pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"]) + ) mods.append(mod) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 649b8ef04fed..c67ac1dc423d 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -246,19 +246,13 @@ def _module_export(module, file_name): # fcompile, addons, kwargs? @register_func("tvm.relay.build") -def _build_module_no_factory( - mod, target=None, target_host=None, params=None, mod_name="default", config=None -): +def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"): """A wrapper around build which discards the Python GraphFactoryRuntime. This wrapper is suitable to be used from other programming languages as the runtime::Module can be freely passed between language boundaries. 
""" target, target_host = Target.check_and_update_host_consist(target, target_host) - ret = build(mod, target, params=params, mod_name=mod_name, config=config) - if isinstance(ret, dict): - return ret - - return ret.module + return build(mod, target, params=params, mod_name=mod_name).module def get_executor_from_target(target, target_host): @@ -379,44 +373,6 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" return executor_factory -def build_pipeline(ir_mods, config): - """build module list that can use for pipeline execution. - Parameters: - ir_mods: - list of IRModule - - config: - build configuration informaiton, structure like following. - {IRModule: {"target":target, - "target_host":target_host, - "params":params, - "mod_name"mod_name, - "build":build}} - - Return: - list of IRModule - """ - mods = {} - for ir_mod in ir_mods: - mod_config = config[ir_mod] - build_func = build - # if there is a self defined build function then use it. - if mod_config["build"]: - build_func = mod_config.build - - mod = build_func( - ir_mod, - mod_config["target"], - params=mod_config["params"], - target_host=mod_config["target_host"], - mod_name=mod_config["mod_name"], - ) - - mods[mod] = {"dev": mod_config["dev"]} - - return mods - - def optimize(mod, target=None, params=None): """Helper function that optimizes a Relay module. diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 614e62ef5899..d4d6b5d8ae07 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -67,9 +67,9 @@ def get_mannual_mod(): return mods, dshape -def test_pipeline(): +def run_pipeline(target): """ - #split compute graph into 4 pipeline + #Get 4 pipeline module. """ mods, dshape = get_mannual_mod() """ @@ -84,25 +84,20 @@ def test_pipeline(): """ outs = [run_modules(mods, tvm.cpu(), "llvm", "data", data) for data in datas] - """ - #Parameter use for pipeline executor creation - #Build module and append module and device type into variable that - #use for pipeline creation. - #first and second pipeline use cuda when cuda enable, second and - #last pipeline use cpu - """ mod_config = {} - for i in range(len(mods)): + indx = 0 + for mod in mods: mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} - # if cuda enabled, first 2 module us cuda as target - if i < 2 and tvm.testing.device_enabled("cuda"): - mconfig["target"] = "cuda" - mconfig["dev"] = tvm.gpu() + # first two module use target that could be "cuda", "nvptx" etc. + if indx < 2: + mconfig["target"] = target[0] + mconfig["dev"] = target[1] else: mconfig["target"] = "llvm" mconfig["dev"] = tvm.cpu() - mod_config[mods[i]] = mconfig + mod_config[mod] = mconfig + indx = indx + 1 """ #build and create pipeline module @@ -136,5 +131,11 @@ def test_pipeline(): tvm.testing.assert_allclose(ref_out, out) +def test_pipeline(): + target_list = tvm.testing.enabled_targets() + for target in target_list: + run_pipeline(target) + + if __name__ == "__main__": test_pipeline() From ca76cb26fc3f985926985bdc6221112625a0308c Mon Sep 17 00:00:00 2001 From: huajsj Date: Wed, 5 May 2021 17:42:21 -0700 Subject: [PATCH 12/28] do pipeline stop in executor deconstructor. 
---
 src/runtime/pipeline/pipeline_executor.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h
index 91d2fc0b3dba..78941eecb101 100644
--- a/src/runtime/pipeline/pipeline_executor.h
+++ b/src/runtime/pipeline/pipeline_executor.h
@@ -41,6 +41,11 @@ namespace runtime {
  */
 class TVM_DLL SubGraphRuntime : public ModuleNode {
  public:
+      ~SubGraphRuntime() {
+        /* stop pipeline threads and release data in the destructor.
+         */
+        Stop();
+      }
   /*!
    * \brief Get member function to front-end
    * \param name The name of the function.

From 01cc700e62697d860ed889916a4b9e30737fa01a Mon Sep 17 00:00:00 2001
From: huajsj
Date: Wed, 5 May 2021 17:56:02 -0700
Subject: [PATCH 13/28] Fix pylint issue.

---
 src/runtime/pipeline/pipeline_executor.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h
index 78941eecb101..a6cd1df6c7b2 100644
--- a/src/runtime/pipeline/pipeline_executor.h
+++ b/src/runtime/pipeline/pipeline_executor.h
@@ -41,11 +41,11 @@ namespace runtime {
  */
 class TVM_DLL SubGraphRuntime : public ModuleNode {
  public:
-      ~SubGraphRuntime() {
-        /* stop pipeline threads and release data in the destructor.
-         */
-        Stop();
-      }
+  ~SubGraphRuntime() {
+    /* stop pipeline threads and release data in the destructor.
+     */
+    Stop();
+  }
   /*!
    * \brief Get member function to front-end
    * \param name The name of the function.

From 681c4ef6afc6b6bbe09e3923d7d39e326bb87882 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Wed, 26 May 2021 23:54:44 -0700
Subject: [PATCH 14/28] Address review comments: handle the final output and
 cross-subgraph output reference issue.

Solution:
1. add output dependency config parsing logic.
2. add an output dependent field.

---
 python/tvm/contrib/pipeline_executor.py      |  17 +-
 src/runtime/pipeline/pipeline_executor.cc    |  34 +-
 src/runtime/pipeline/pipeline_executor.h     |  66 +++-
 src/runtime/pipeline/pipeline_function.cc    |  36 ++-
 src/runtime/pipeline/pipeline_function.h     |   7 +-
 src/runtime/pipeline/pipeline_struct.h       | 323 ++++++++++++++-----
 tests/python/relay/test_pipeline_executor.py | 138 +++++---
 7 files changed, 460 insertions(+), 161 deletions(-)

diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index de8040ff739c..dc482d141f3a 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -15,12 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 """Minimum pipeline executor that executes pipeline containing TVM PackedFunc."""
+import json
 import tvm._ffi
 from tvm import relay
 from tvm.contrib import graph_executor
 
 
-def build_pipeline(ir_mods, config):
+def build_pipeline(config):
     """Build a list of modules for pipeline execution.
     Parameters:
     ir_mods:
         list of IRModule
 
     config:
         build configuration information, structured as follows.
         {IRModule: {"target":target,
                     "target_host":target_host,
                     "params":params,
                     "mod_name": mod_name,
                     "build":build}}
 
     Return:
         dict that maps each built module to its device configuration
     """
     mods = {}
-    for ir_mod in ir_mods:
+    string_config = [{}] * len(config)
+    for ir_mod in config:
         mod_config = config[ir_mod]
+        string_config[mod_config["pipeline"]["mod_indx"] - 1] = mod_config["pipeline"]
         build_func = relay.build
         # if there is a self-defined build function then use it.
if mod_config["build"]: @@ -55,7 +58,7 @@ def build_pipeline(ir_mods, config): mods[mod] = {"dev": mod_config["dev"]} - return mods + return mods, string_config def create(mods, mod_config): @@ -72,7 +75,7 @@ def create(mods, mod_config): submodule : PipelineModule Runtime pipeline module. """ - pipeline_mods = build_pipeline(mods, config=mod_config) + pipeline_mods, string_config = build_pipeline(mod_config) mods = [] for pipeline_mod in pipeline_mods: @@ -82,7 +85,7 @@ def create(mods, mod_config): mods.append(mod) - submodule = PipelineModule(mods) + submodule = PipelineModule(mods, json.dumps(string_config)) return submodule @@ -105,13 +108,13 @@ class PipelineModule(object): """ - def __init__(self, graph_modules): + def __init__(self, graph_modules, pipeline_config): mods = [] for module in graph_modules: mods.append(module.module) pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") - module = pipelinecreate(mods) + module = pipelinecreate(mods, pipeline_config) self.graph_modules_ = graph_modules diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index d20292bf1852..5b90a593b63d 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -21,7 +21,6 @@ * \file pipeline_runtime.cc */ #include "pipeline_executor.h" - #include namespace tvm { @@ -36,29 +35,15 @@ void SubGraphRuntime::Stop() { pipeline_stop(runtimes); } */ void SubGraphRuntime::Run() { pipeline_run(runtimes); } -void SubGraphRuntime::Init(const Array& modules) { - pipeline_init(modules, &runtimes); - SetupStorage(); +void SubGraphRuntime::Init(const Array& modules, + const std::string& pipeline_json) { + std::istringstream is(pipeline_json); + dmlc::JSONReader reader(&is); + this->Load(&reader); + pipeline_init(modules, &runtimes, &pipeline_conf); return; } -void SubGraphRuntime::SetupStorage(void) { - auto lastGraphRuntime = runtimes.back(); - int outputNum = lastGraphRuntime->runtimePtr->NumOutputs(); - for (int i = 0; i < outputNum; i++) { - NDArray array = lastGraphRuntime->runtimePtr->GetOutput(i); - auto dltensor = const_cast(array.operator->()); - vector shape; - for (int i = 0; i < dltensor->ndim; i++) { - shape.push_back(dltensor->shape[i]); - } - - auto ndarray = NDArray::Empty(shape, dltensor->dtype, dltensor->device); - ndarray.CreateView(shape, dltensor->dtype); - output_entry_.push_back(ndarray); - } -} - /*! * \brief set index-th input to the graph. * \param index The input index. 
@@ -169,14 +154,15 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, } } -Module PipelineRuntimeCreate(const Array& m) { +Module PipelineRuntimeCreate(const Array& m, + const std::string& pipeline_json) { auto exec = make_object(); - exec->Init(m); + exec->Init(m, pipeline_json); return Module(exec); } TVM_REGISTER_GLOBAL("tvm.pipeline_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = PipelineRuntimeCreate(args[0]); + *rv = PipelineRuntimeCreate(args[0], args[1]); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index a6cd1df6c7b2..bfbb21839a80 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -26,10 +26,12 @@ #define TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ #include #include +#include #include +#include "../file_utils.h" #include "pipeline_function.h" - +using namespace std; namespace tvm { namespace runtime { @@ -60,7 +62,6 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { const char* type_key() const final { return "SubGraphRuntime"; } void Run(); void Stop(); - void SetupStorage(); /*! * \brief Initialize the graph executor with graph and context. @@ -73,7 +74,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { * by storage_id. If not given, linked parameters are looked-up using an internal implementation, * which is not compatible with RPCModules. */ - void Init(const Array& modules); + void Init(const Array& modules, const std::string& pipeline_json); /*! * \brief set index-th input to the graph. @@ -104,11 +105,64 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { */ Array GetOutput(bool syncPoll = true); + void Load(dmlc::JSONReader* reader) { + reader->BeginArray(); + while (reader->NextArrayItem()) { + std::string key; + reader->BeginObject(); + int mod_indx = 0; + unordered_map> output; + while (reader->NextObjectItem(&key)) { + if (key == "mod_indx") { + reader->Read(&mod_indx); + } + if (key == "output") { + reader->BeginArray(); + while (reader->NextArrayItem()) { + int output_indx = -1; + unordered_map depend; + reader->BeginObject(); + while (reader->NextObjectItem(&key)) { + if (key == "output_indx") { + reader->Read(&output_indx); + } + if (key == "dependent") { + reader->BeginArray(); + int dep_mod_indx = -1, input_indx = -1; + while (reader->NextArrayItem()) { + reader->BeginObject(); + while (reader->NextObjectItem(&key)) { + if (key == "mod_indx") { + reader->Read(&dep_mod_indx); + } + if (key == "input_indx") { + reader->Read(&input_indx); + } + } + if (dep_mod_indx >= 0 && input_indx >= 0) { + depend[dep_mod_indx] = input_indx; + } + } + } + } + + if (output_indx >= 0) { + output[output_indx] = depend; + } + } + } + } + if (mod_indx >= 0) { + pipeline_conf[mod_indx] = output; + } + } + } + protected: - std::vector output_entry_; - std::vector> runtimes; + vector output_entry_; + PIPELINE_CONF pipeline_conf; + vector> runtimes; }; } // namespace runtime } // namespace tvm - #endif // TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index 7e3656d5181b..cd916082f386 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -38,8 +38,9 @@ void pipeline_pipeline_run(const int& num, const shared_ptr& curRun curRunItem->Run(); - auto output = curRunItem->GetOutput(); - pipeline_queue_push(nextQueue, output); + vector> outputs; + 
curRunItem->GetOutput2(&outputs); + pipeline_queue_push(nextQueue, &outputs); curRunItem->notifyDataReadyToNext(); } curRunItem->notifyNextExit(); @@ -52,16 +53,20 @@ thread* pipeline_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { return NULL; } -void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { +void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, + PIPELINE_CONF* pipeline_conf) { int len = graphRuntimes.size(); for (int i = 0; i < len; i++) { QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); - auto runItem = make_shared(graphRuntimes[i], sub_queue); + /* runtimeIndx start from 1. + */ + int runtimeIndx = i + 1; + auto runItem = make_shared(graphRuntimes[i], sub_queue, + &((*pipeline_conf)[runtimeIndx]), runtimeIndx); runtimes->push_back(runItem); - /* - set prev and next for RuntimeItem, runtime need these information to - poll data from prev and do notification for next. - */ + /*set prev and next for RuntimeItem, runtime need these information to + * poll data from prev and do notification for next. + */ if (i > 0) { (*runtimes)[i - 1]->next = (*runtimes)[i]; } @@ -73,8 +78,8 @@ void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes) { return; } -inline void pipeline_queue_push(QUEUE* queue, Array arrays) { - q_push>(queue, arrays); +inline void pipeline_queue_push(QUEUE* queue, vector>* outputs) { + q_push>*>(queue, outputs); return; } @@ -85,7 +90,10 @@ bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { void pipeline_run(const SHARED_RUNTIME_VEC& runtimes) { shared_ptr runtime = runtimes.front(); runtime->Run(); - pipeline_queue_push(runtime->next->queue, runtime->GetOutput()); + + vector> outputs; + runtime->GetOutput2(&outputs); + pipeline_queue_push(runtime->next->queue, &outputs); runtime->notifyDataReadyToNext(); return; } @@ -108,4 +116,8 @@ bool pipeline_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, return suc; } -void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes) { runtimes.front()->notifyNextExit(); } +void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes) { + if (!runtimes.empty()) { + runtimes.front()->notifyNextExit(); + } +} diff --git a/src/runtime/pipeline/pipeline_function.h b/src/runtime/pipeline/pipeline_function.h index 8236399c4057..c3a8a29f76d3 100644 --- a/src/runtime/pipeline/pipeline_function.h +++ b/src/runtime/pipeline/pipeline_function.h @@ -19,6 +19,7 @@ #ifndef TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #define TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #include +#include #include #include "pipeline_data.h" @@ -26,10 +27,12 @@ using namespace std; using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; +typedef unordered_map>> PIPELINE_CONF; -void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes); +void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, + PIPELINE_CONF* pipeline_conf); void pipeline_run(const SHARED_RUNTIME_VEC& runtimes); -inline void pipeline_queue_push(QUEUE* queue, Array arrays); +inline void pipeline_queue_push(QUEUE* queue, vector>* outputs); bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData); bool pipeline_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSync = false); diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 0e99fb991793..4572c1acf288 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -32,12 +32,15 @@ #include #include #include +#include #include #define SLOT slot_t<> 
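For reference, a sketch of the pipeline_json string that the Load() routine above parses, with values borrowed from the test at the end of this patch; a dependent entry whose mod_indx is 0 marks that output as a final pipeline output rather than another module's input:

    [
        {"mod_indx": 1,
         "output": [
             {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_indx": 1}]},
             {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_indx": 1}]},
             {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_indx": 1}]}
         ]},
        {"mod_indx": 2,
         "output": [{"output_indx": 1, "dependent": [{"mod_indx": 3, "input_indx": 2}]}]},
        {"mod_indx": 3,
         "output": [{"output_indx": 1, "dependent": [{"mod_indx": 0, "input_indx": 2}]}]}
    ]

After parsing, pipeline_conf[mod_indx][output_indx][dep_mod_indx] yields the consumer's input index, which pipeline_init hands to each RuntimeItem at startup.
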
#define SUB_Q_SIZE 1024 using namespace tvm::runtime; using namespace std; -// thread control struction, for single consumer single producer mode +typedef unordered_map> RUNTIME_PIPELINE_OUTPUT_CONF; +/* thread control struction, for single consumer single producer mode. + */ class TControl { private: condition_variable cond; @@ -64,8 +67,7 @@ class TControl { } void exit_notify(thread* t) { - /* - * set bExit first then notify + /* set bExit first then notify */ bExit = true; notify(); @@ -75,111 +77,204 @@ class TControl { } }; -class pipelineData { +#define DEPENDENT_MAX 32 +#define TYP_MAX(type) (1 << size_of(type) - 1) +typedef uint8_t DEP_INDX_TYPE; +class Dependent { private: - void ResetDataList(size_t num) { - if (max_num < num) { - for (size_t i = 0; i < max_num; i++) { - TVMArrayFree(dataList[i]); - } - - if (dataList) { - free(dataList); - } + /* index 0 represent output is final output or not.*/ + uint8_t bFinal = false; + /* how many dependent*/ + uint8_t depNum = 0; + /* dependent input index number.*/ + union { + DEP_INDX_TYPE dependent[DEPENDENT_MAX] = {0}; + DEP_INDX_TYPE outputIndx; + }; - dataList = reinterpret_cast(calloc(num, sizeof(DLTensor*))); - max_num = num; + public: + void SetDepModInputIndx(const int modIndx, const uint8_t inputIndx) { + assert(modIndx <= DEPENDENT_MAX); + assert(inputIndx <= TYP_MAX(DEP_INDX_TYPE)); + if (modIndx == 0) { + bFinal = true; + outputIndx = inputIndx; + } else { + dependent[modIndx - 1] = inputIndx; } - return; + depNum++; + } + + int GetOutputIndx(void) { return outputIndx; } + + int GetDepModInputIndx(const int modIndx) { return dependent[modIndx - 1]; } + + void RemoveDependentRef(const int modIndx) { + dependent[modIndx - 1] = 0; + depNum--; } - DLTensor* CreateCopyFrom(const DLTensor* from, DLTensor** to, int device_type, int device_id) { + /* + * check if the output need get forward to next runtime. + */ + bool NeedForward() { return (bFinal || depNum > 0); } +}; + +class InputData { + public: + Dependent dependent; + DLTensor* data = nullptr; + + DLTensor* CreateCopyFrom(const DLTensor* from, int device_type, int device_id) { size_t fromLen = tvm::runtime::GetDataSize(*from); - size_t toLen = *to ? tvm::runtime::GetDataSize(*(*to)) : 0; + size_t toLen = data ? tvm::runtime::GetDataSize(*data) : 0; if (fromLen != toLen) { - if (*to) { - TVMArrayFree(*to); - *to = nullptr; + if (data) { + TVMArrayFree(data); + data = nullptr; } TVMArrayAlloc(from->shape, from->ndim, from->dtype.code, from->dtype.bits, from->dtype.lanes, - device_type, device_id, to); + device_type, device_id, &data); } - TVMArrayCopyFromTo(const_cast(from), *to, nullptr); - return *to; + TVMArrayCopyFromTo(const_cast(from), data, nullptr); + return data; } + ~InputData() { + if (data) { + TVMArrayFree(data); + data = nullptr; + } + } +}; +class OutputData { public: - void Copy(const Array& dlArray, int device_type, int device_id) { - num = dlArray.size(); - ResetDataList(num); + OutputData(const NDArray& data, const size_t Indx, + RUNTIME_PIPELINE_OUTPUT_CONF runtime_pipeline_output_conf) { + assert(runtime_pipeline_output_conf.size() < DEPENDENT_MAX); + /* use data_ to keep the NDArray data reference, to avoid memory + * used by DLTensor get freed. 
+ */ + data_ = data; + dltensor = const_cast(data_.operator->()); + outputIndx = Indx; + for (auto conf : runtime_pipeline_output_conf[outputIndx]) { + dependent.SetDepModInputIndx(conf.first, conf.second); + } + } - for (size_t i = 0; i < num; i++) { - CreateCopyFrom(const_cast(dlArray[i].operator->()), &dataList[i], - device_type, device_id); + explicit OutputData(const InputData* pdata) { + dependent = pdata->dependent; + /* caller need make sure pdata->data is avaialble. + */ + dltensor = pdata->data; + } + + OutputData& operator=(const InputData* pdata) { + dependent = pdata->dependent; + /* caller need make sure pdata->data is avaialble. + */ + dltensor = pdata->data; + return *this; + } + + int runtimeIdx; + /* reserved, for debug purpose + */ + int outputIndx; + /* index 0 represent output is final output or not. + * index offset is dependent mod index, + * value is dependent mode input index + */ + Dependent dependent; + DLTensor* dltensor; + + private: + NDArray data_; +}; + +class PipelineData { + private: + void FreeData() { + for (size_t i = 0; i < max_num; i++) { + delete inputList[i]; + } + + if (inputList) { + free(inputList); + } + } + + void ResetDataList(size_t num) { + if (max_num < num) { + FreeData(); + inputList = reinterpret_cast(calloc(num, sizeof(InputData))); + max_num = num; } return; } - void Copy(const DLTensor* dlTensor, int device_type, int device_id) { - num = 1; - ResetDataList(num); - CreateCopyFrom(dlTensor, &dataList[0], device_type, device_id); + InputData* CreateCopyFrom(const DLTensor* fromData, const Dependent& fromDep, InputData** to, + int device_type, int device_id) { + if (!*to) { + *to = new InputData; + } + + (*to)->CreateCopyFrom(fromData, device_type, device_id); + (*to)->dependent = fromDep; + return *to; + } + + public: + void ExportAppendData(vector>* outputs) { + for (size_t i = 0; i < num; i++) { + shared_ptr var = make_shared(inputList[i]); + outputs->push_back(var); + } return; } - void Copy(const vector& dlTensors, int device_type, int device_id) { - num = dlTensors.size(); + void Copy(const vector& dlInput, int device_type, int device_id) { + num = dlInput.size(); ResetDataList(num); for (size_t i = 0; i < num; i++) { - CreateCopyFrom(dlTensors[i], &dataList[i], device_type, device_id); + CreateCopyFrom(dlInput[i]->data, dlInput[i]->dependent, &inputList[i], device_type, + device_id); } return; } - void Copy(DLTensor** dlTensors, size_t dlNum, int device_type, int device_id) { - num = dlNum; + void Copy(const vector>* dlOutput, int device_type, int device_id) { + num = dlOutput->size(); ResetDataList(num); for (size_t i = 0; i < num; i++) { - auto dlTensor = const_cast(dlTensors[i]); - CreateCopyFrom(dlTensor, &dataList[i], device_type, device_id); + CreateCopyFrom(dlOutput->at(i)->dltensor, dlOutput->at(i)->dependent, &inputList[i], + device_type, device_id); } return; } + size_t num; size_t max_num; - DLTensor** dataList; + InputData** inputList; + TControl controlData; - pipelineData(void) : num(0), max_num(0), dataList(nullptr) {} + PipelineData(void) : num(0), max_num(0), inputList(nullptr) {} + ~PipelineData(void) { FreeData(); } }; template class slot_t { public: bool bExit = false; - pipelineData data; + PipelineData data; slot_t(void) {} - // overwrite operator = to handle "(slot) s = (OutputData) d;" - slot_t& operator=(const DLTensor* dlTensor) { - data.Copy(dlTensor, device_type, device_id); - return *this; - } - - slot_t& operator=(const vector dlTensors) { - data.Copy(dlTensors, device_type, device_id); - return *this; 
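To make the Dependent encoding above easier to follow, a behavioral sketch in Python (field names mirror the C++ members; only the slot-0 convention and the reference counting are modeled, and the fixed DEPENDENT_MAX array is replaced by a dict for brevity):

    class Dependent:
        """For one module output, record which modules consume it and where."""

        def __init__(self):
            self.b_final = False  # output is a final pipeline output
            self.dep_num = 0      # number of outstanding consumers
            self.dependent = {}   # consumer module index -> consumer input index

        def set_dep_mod_input_indx(self, mod_indx, input_indx):
            if mod_indx == 0:
                # mod_indx 0 is reserved: the value is a final output and
                # input_indx is reused as the global output slot
                self.b_final, self.output_indx = True, input_indx
            else:
                self.dependent[mod_indx] = input_indx
            self.dep_num += 1

        def remove_dependent_ref(self, mod_indx):
            # a consumer took its copy; drop the reference
            del self.dependent[mod_indx]
            self.dep_num -= 1

        def need_forward(self):
            # keep forwarding the tensor while any consumer (or the
            # final-output sink) still references it
            return self.b_final or self.dep_num > 0
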
- } - - slot_t& operator=(const Array dlTensors) { - data.Copy(dlTensors, device_type, device_id); - return *this; - } - - slot_t& operator=(const slot_t& slot) { - data.Copy(slot.data.dataList, slot.data.num, device_type, device_id); + slot_t& operator=(const vector>* outputData) { + data.Copy(outputData, device_type, device_id); return *this; } }; @@ -190,9 +285,35 @@ class pipelineOutputData { explicit pipelineOutputData(vector* datas) : datas_(datas) { ; } pipelineOutputData& operator=(const slot_t& slot) { assert(datas_->size() >= slot.data.num); + unordered_map dataMap; + /* output may not ordered by index in slot, use a map to index them. + */ for (size_t i = 0; i < slot.data.num; i++) { - auto dlTensor = slot.data.dataList[i]; - (*datas_)[i].CopyFrom(dlTensor); + auto dlTensor = slot.data.inputList[i]->data; + int outputIndx = slot.data.inputList[i]->dependent.GetOutputIndx() - 1; + assert(outputIndx < slot.data.num); + dataMap[outputIndx] = dlTensor; + } + + for (size_t i = 0; i < dataMap.size(); i++) { + auto dlTensor = dataMap[i]; + /* alloc NDArray if there is no NDArray Allocated in vector + */ + if (datas_->size() < i + 1) { + /* allocated NDArray + */ + vector shape; + for (int i = 0; i < dlTensor->ndim; i++) { + shape.push_back(dlTensor->shape[i]); + } + auto ndarray = NDArray::Empty(shape, dlTensor->dtype, dlTensor->device); + ndarray.CreateView(shape, dlTensor->dtype); + + /* push into NDArray vector + */ + datas_->push_back(ndarray); + } + datas_->at(i).CopyFrom(dlTensor); } return *this; } @@ -303,8 +424,13 @@ class RuntimeFunction { class RuntimeData { private: shared_ptr runtimePtr; - template - void ImportData(type dlTensors, size_t inputsLen) { + int runtimeIndx; + /* Storage these data that need to get forwarding to + * next runtime. + */ + PipelineData forwardData; + + void ImportData(vector dlTensors, size_t inputsLen) { assert(runtimePtr->NumInputs() >= inputsLen); for (size_t i = 0; i < inputsLen; i++) { /* @@ -316,16 +442,47 @@ class RuntimeData { return; } + void ImportPipelineData(InputData** data, size_t inputsLen) { + assert(runtimePtr->NumInputs() >= inputsLen); + vector forwardDatas; + for (size_t i = 0; i < inputsLen; i++) { + /* + * Use SetInput which have logic to handle + * cross device memory copy to set input data. + */ + int inputIndx = data[i]->dependent.GetDepModInputIndx(runtimeIndx); + if (inputIndx > 0) { + runtimePtr->SetInput(inputIndx - 1, data[i]->data); + /* data getused remove dependent reference for current runtime + */ + data[i]->dependent.RemoveDependentRef(runtimeIndx); + } + + /* save these data that need forwarding to next runtime. 
+ */ + if (data[i]->dependent.NeedForward()) { + forwardDatas.push_back(data[i]); + } + } + + forwardData.Copy(forwardDatas, kDLCPU, 0); + return; + } + public: - void Init(shared_ptr runtime) { runtimePtr = runtime; } + void ExportAppendData(vector>* outputs) { + forwardData.ExportAppendData(outputs); + return; + } - RuntimeData& operator=(const SLOT& slot) { - ImportData(slot.data.dataList, slot.data.num); - return *this; + void Init(shared_ptr runtime, int Indx) { + runtimeIndx = Indx; + runtimePtr = runtime; } - RuntimeData& operator=(vector dlTensors) { - ImportData>(dlTensors, dlTensors.size()); + RuntimeData& operator=(const SLOT& slot) { + ImportPipelineData(slot.data.inputList, slot.data.num); + return *this; } }; @@ -335,22 +492,27 @@ class RuntimeItem { shared_ptr prev = nullptr; shared_ptr next = nullptr; + RUNTIME_PIPELINE_OUTPUT_CONF runtime_pipeline_output_conf; + int runtimeIndx; int inputsNum; RuntimeData rData; TControl control; QUEUE* queue = nullptr; thread t; shared_ptr runtimePtr = nullptr; - RuntimeItem(Module mod, QUEUE* inputQueue) { + RuntimeItem(Module mod, QUEUE* inputQueue, RUNTIME_PIPELINE_OUTPUT_CONF* pconfig, int indx) { if (runtimePtr == nullptr) { runtimePtr = make_shared(mod); inputsNum = runtimePtr->NumOutputs(); - rData.Init(runtimePtr); + runtimeIndx = indx; + rData.Init(runtimePtr, runtimeIndx); } if (!queue) { queue = inputQueue; } + runtime_pipeline_output_conf = *pconfig; + runtimeIndx = indx; } RuntimeItem(void) {} @@ -390,6 +552,19 @@ class RuntimeItem { } return outputs; } + + void GetOutput2(vector>* outputs) { + size_t outputsNum = runtimePtr->NumOutputs(); + for (size_t i = 0; i < outputsNum; i++) { + shared_ptr output = + make_shared(runtimePtr->GetOutput(i), i + 1, runtime_pipeline_output_conf); + + outputs->push_back(output); + } + + rData.ExportAppendData(outputs); + return; + } }; #endif // TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index d4d6b5d8ae07..d69875bc01d0 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -23,46 +23,84 @@ from tvm.contrib import graph_executor, pipeline_executor -def run_modules(mods, dev, target, dname, data): - for mod in mods: +def run_modules(mod_configs, dev, target, dname, data): + mod_input = {} + final_output = {} + indx = 1 + for mod in mod_configs: with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target) m = graph_executor.GraphModule(lib["default"](dev)) - m.set_input(dname, data) + # Get input information + mod_key = indx + if mod_key in mod_input: + for input in mod_input[mod_key]: + input = mod_input[mod_key][input] + m.set_input(input["index"] - 1, input["data"]) + else: + m.set_input(dname, data) m.run() n = m.get_num_outputs() - output = m.get_output(0).asnumpy() - data = output + # parse mod_config and set current output as next mod input data + mconfig = mod_configs[mod] + for output in mconfig["pipeline"]["output"]: + output_data = m.get_output(output["output_indx"] - 1).asnumpy() + for dep in output["dependent"]: + # currnet output use as dependent input, + # input_indx indicate the input index number. 
+ input_indx = dep["input_indx"] + mod_indx = dep["mod_indx"] + if mod_indx == 0: + final_output[input_indx] = output_data + else: + if mod_indx in mod_input: + mod_input[mod_indx][input_indx] = {"index": input_indx, "data": output_data} + else: + mod_input[mod_indx] = { + input_indx: {"index": input_indx, "data": output_data} + } + indx = indx + 1 - return output + return final_output def get_mannual_mod(): mods = [] dshape = (3, 3) data = relay.var("data", relay.TensorType(dshape, "float32")) - mvalue1 = np.full((1), 5).astype("float32") + data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) + data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32")) + data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) + mvalue1 = np.full((1), 1).astype("float32") mvalue2 = np.full((1), 2).astype("float32") mvalue3 = np.full((1), 3).astype("float32") - mvalue4 = np.full((1), 4).astype("float32") mv1 = relay.Constant(tvm.nd.array(mvalue1)) mv2 = relay.Constant(tvm.nd.array(mvalue2)) mv3 = relay.Constant(tvm.nd.array(mvalue3)) - mv4 = relay.Constant(tvm.nd.array(mvalue4)) - net1 = relay.multiply(data, mv1) - net2 = relay.add(data, mv2) - net2 = relay.add(net2, mv3) + # net1 have three output, output3 is final output + net_output1 = relay.add(data, mv1) + net_output2 = relay.subtract(data, mv2) + net_output3 = relay.multiply(data, mv3) - net3 = relay.multiply(data, mv4) + # net2 use net1 output1 as input + net2 = relay.add(data_net1_output_1, mv2) + net2 = relay.add(net2, mv3) - net4 = relay.subtract(data, mv1) + # net3 use net2 output1 and net1 outpu2 as input + net3 = relay.multiply(data_net2_output_1, mv3) + net3 = relay.add(net3, data_net1_output_2) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net1))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net2))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net3))) - mods.append(tvm.IRModule.from_expr(relay.Function([data], net4))) + mods.append( + tvm.IRModule.from_expr( + relay.Function([data], relay.Tuple([net_output1, net_output2, net_output3])) + ) + ) + mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1], net2))) + mods.append( + tvm.IRModule.from_expr(relay.Function([data_net1_output_2, data_net2_output_1], net3)) + ) return mods, dshape @@ -79,27 +117,53 @@ def run_pipeline(target): for i in range(len(mods) + 1): datas.append(np.full(dshape, 3 + i).astype("float32")) + # set configure + indx = 0 + mod_config = {} + mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} + mconfig1 = mconfig.copy() + mconfig1["target"] = target[0] + mconfig1["dev"] = target[1] + # third output is final output, second output for mod2, third for mod3 + # input + mconfig1["pipeline"] = { + "mod_indx": 1, + "output": [ + {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_indx": 1}]}, + {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_indx": 1}]}, + {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_indx": 1}]}, + ], + } + mod_config[mods[0]] = mconfig1 + + mconfig2 = mconfig.copy() + mconfig2["target"] = "llvm" + mconfig2["dev"] = tvm.cpu(0) + mconfig2["pipeline"] = { + "mod_indx": 2, + "output": [ + {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_indx": 2}]}, + ], + } + mod_config[mods[1]] = mconfig2 + + mconfig3 = mconfig.copy() + mconfig3["target"] = "llvm" + mconfig3["dev"] = tvm.cpu(0) + + mconfig3["pipeline"] = { + "mod_indx": 3, + "output": [{"output_indx": 1, "dependent": 
[{"mod_indx": 0, "input_indx": 2}]}], + } + mod_config[mods[2]] = mconfig3 + """ #Run with graph executor for verification purpose """ - outs = [run_modules(mods, tvm.cpu(), "llvm", "data", data) for data in datas] - - mod_config = {} - indx = 0 - for mod in mods: - mconfig = {"target_host": None, "mod_name": "default", "build": None, "params": None} - # first two module use target that could be "cuda", "nvptx" etc. - if indx < 2: - mconfig["target"] = target[0] - mconfig["dev"] = target[1] - else: - mconfig["target"] = "llvm" - mconfig["dev"] = tvm.cpu() + outs = [run_modules(mod_config, tvm.cpu(), "llvm", "data", data) for data in datas] + """ - mod_config[mod] = mconfig - indx = indx + 1 - """ #build and create pipeline module """ with relay.build_config(opt_level=3): @@ -117,7 +181,8 @@ def run_pipeline(target): """ pipeline_outputs = [] for i in range(len(datas)): - pipeline_outputs.append(pipeline_module.get_output()[0].asnumpy()) + curOutputs = [output.asnumpy() for output in pipeline_module.get_output()] + pipeline_outputs.append(curOutputs) """ #Stop pipeline execution. @@ -128,7 +193,8 @@ def run_pipeline(target): #Verify result """ for ref_out, out in zip(outs, pipeline_outputs): - tvm.testing.assert_allclose(ref_out, out) + for ref in ref_out: + tvm.testing.assert_allclose(ref_out[ref], out[ref - 1]) def test_pipeline(): From 9d24a0fdf46ffa17d3dc321c00b8b069eb646134 Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 27 May 2021 12:30:54 -0700 Subject: [PATCH 15/28] Fix plint error. --- src/runtime/pipeline/pipeline_executor.cc | 3 ++- src/runtime/pipeline/pipeline_executor.h | 6 +++--- src/runtime/pipeline/pipeline_struct.h | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 5b90a593b63d..52c4131954c8 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -18,9 +18,10 @@ */ /*! - * \file pipeline_runtime.cc + * \file pipeline_executor.cc */ #include "pipeline_executor.h" + #include namespace tvm { diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index bfbb21839a80..abe9a5e40918 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -18,9 +18,8 @@ */ /*! - * \brief Tiny graph runtime that can run graph - * containing only tvm PackedFunc. 
- * \file graph_runtime.h + * \brief pipeline executor + * \file pipeline_executor.h */ #ifndef TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ #define TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ @@ -31,6 +30,7 @@ #include "../file_utils.h" #include "pipeline_function.h" + using namespace std; namespace tvm { namespace runtime { diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 4572c1acf288..cc153c9b0ca9 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -36,6 +36,7 @@ #include #define SLOT slot_t<> #define SUB_Q_SIZE 1024 + using namespace tvm::runtime; using namespace std; typedef unordered_map> RUNTIME_PIPELINE_OUTPUT_CONF; @@ -379,8 +380,7 @@ class RuntimeFunction { */ void CopyFromTo(DLTensor* from, DLTensor* to) { if (!(from->device.device_type == to->device.device_type || - from->device.device_type == kDLCPU || to->device.device_type == kDLCPU || - from->device.device_type == kDLCPUPinned || to->device.device_type == kDLCPUPinned)) { + from->device.device_type == kDLCPU || to->device.device_type == kDLCPU)) { if (dlLocal == nullptr) { dlLocal = CreateFromDLTensor(from); } From fc30cac425dac07a9d9bedc123ce358de6d64e93 Mon Sep 17 00:00:00 2001 From: huajsj Date: Fri, 28 May 2021 11:30:21 -0700 Subject: [PATCH 16/28] add correct comments. --- tests/python/relay/test_pipeline_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index d69875bc01d0..060b8425058f 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -124,7 +124,7 @@ def run_pipeline(target): mconfig1 = mconfig.copy() mconfig1["target"] = target[0] mconfig1["dev"] = target[1] - # third output is final output, second output for mod2, third for mod3 + # third output is final output, second output for mod3, first for mod2 # input mconfig1["pipeline"] = { "mod_indx": 1, From ecc71c0ec52460d87b1962045e3c7d8b428de6b8 Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 3 Jun 2021 19:44:28 -0700 Subject: [PATCH 17/28] address review comments. --- python/tvm/contrib/pipeline_executor.py | 84 ++++++++++++-------- src/runtime/pipeline/pipeline_executor.cc | 39 ++++++--- src/runtime/pipeline/pipeline_executor.h | 5 +- src/runtime/pipeline/pipeline_function.cc | 24 ++++-- src/runtime/pipeline/pipeline_function.h | 4 +- tests/python/relay/test_pipeline_executor.py | 26 ++++-- 6 files changed, 122 insertions(+), 60 deletions(-) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index dc482d141f3a..06c5bd0ddc78 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -14,40 +14,65 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Minimum pipeline executor that executes pipeline containing TVM PackedFunc.""" +"""Pipeline executor that executes pipeline containing TVM PackedFunc.""" import json import tvm._ffi from tvm import relay from tvm.contrib import graph_executor +def pipeline_executor_enabled(): + """ check if pipeline executor enabled. 
""" + pipeline_enabled = False + try: + pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") + assert pipelinecreate + pipeline_enabled = True + except ValueError: + print("pipeline executor not enabled!") + + return pipeline_enabled + + def build_pipeline(config): """build module list that can use for pipeline execution. - Parameters: - ir_mods: - list of IRModule - config: - build configuration informaiton, structure like following. + Parameters + ---------- + + config: Dict[IRModule, Dict[str, Any]] + build configuration informaton, structure like following. {IRModule: {"target":target, "target_host":target_host, "params":params, "mod_name"mod_name, "build":build}} - Return: + Returns + ------- + ret: List[IRModule] list of IRModule + string_config: Dict[int, Dict[str, any]] + pipeline configuration """ mods = {} - string_config = [{}] * len(config) + config_len = len(config) + string_config = [{} for _ in range(config_len)] for ir_mod in config: + # Get module configuration mod_config = config[ir_mod] - string_config[mod_config["pipeline"]["mod_indx"] - 1] = mod_config["pipeline"] + assert "pipeline" in mod_config and "mod_indx" in mod_config["pipeline"] + # Get module index in pipeline configuration + mod_indx = mod_config["pipeline"]["mod_indx"] - 1 + assert mod_indx < config_len + # Create pipeline configuration + string_config[mod_indx] = mod_config["pipeline"] build_func = relay.build # if there is a self defined build function then use it. if mod_config["build"]: build_func = mod_config.build + # build IRModule mod = build_func( ir_mod, mod_config["target"], @@ -58,6 +83,7 @@ def build_pipeline(config): mods[mod] = {"dev": mod_config["dev"]} + # return IRModule list and pipeline configuration return mods, string_config @@ -66,9 +92,11 @@ def create(mods, mod_config): Parameters ---------- - sub_mods : - {"lib": , - "dev": } + mods : List[IRModule] + list of IRModule + + mod_config : Dict[IRModule, Dict[str, Any]] + modules and modules dependency configuration informaiton. Returns ------- @@ -98,13 +126,11 @@ class PipelineModule(object): Parameters ---------- - module : tvm.runtime.Module + graph_module : List[GraphModule] The internal tvm module that holds the actual graph functions. - Attributes - ---------- - module : tvm.runtime.Module - The internal tvm module that holds the actual graph functions. + pipeline_config : Dict[IRModule, Dict[str, Any]] + modules and modules dependency configuration informaiton. """ @@ -114,6 +140,7 @@ def __init__(self, graph_modules, pipeline_config): mods.append(module.module) pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") + assert pipelinecreate module = pipelinecreate(mods, pipeline_config) self.graph_modules_ = graph_modules @@ -126,39 +153,30 @@ def __init__(self, graph_modules, pipeline_config): self._get_num_outputs = module["get_num_outputs"] self._get_num_inputs = module["get_num_inputs"] - def set_input(self, key=None, value=None, params=None): + def set_input(self, key, value, params=None, modindx=0): """Set inputs to the module via kwargs Parameters ---------- - key : int or str + key : array_like The input key - value : the input value. + value : array_like. 
The input key params : dict of str to NDArray Additional arguments """ if key is not None: - self.graph_modules_[0].set_input(key, value) + self.graph_modules_[modindx].set_input(key, value) if params: - indx = 0 for param in params: - self.graph_modules_[indx].set_input(**param) + self.graph_modules_[modindx].set_input(**param) indx = indx + 1 - def run(self, **input_dict): - """Run forward execution of the graph - - Parameters - ---------- - input_dict: dict of str to NDArray - List of input values to be feed to - """ - if input_dict: - self.set_input(**input_dict) + def run(self): + """Run forward execution of the graph""" self._run() def stop(self): diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 52c4131954c8..3ab18a5fe3b2 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -41,22 +41,29 @@ void SubGraphRuntime::Init(const Array& modules, std::istringstream is(pipeline_json); dmlc::JSONReader reader(&is); this->Load(&reader); - pipeline_init(modules, &runtimes, &pipeline_conf); + outpuNumber = pipeline_init(modules, &runtimes, &pipeline_conf); return; } /*! - * \brief set index-th input to the graph. + * \brief set index-th input to the modIndx-th graph. * \param index The input index. * \param data_in The input data. + * \param modIndx The runtime index. */ -void SubGraphRuntime::SetInput(int index, DLTensor* data_in) { - auto gruntime = runtimes.front(); +void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int modIndx) { + auto gruntime = runtimes[modIndx]; gruntime->runtimePtr->SetInput(index, data_in); } -void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in) { - auto gruntime = runtimes.front(); +/*! + * \brief set index-th input to the modIndx-th graph. + * \param index The input index. + * \param data_in The input data. + * \param modIndx The runtime index. + */ +void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in, int modIndx) { + auto gruntime = runtimes[modIndx]; gruntime->runtimePtr->SetInput(name, data_in); } @@ -65,14 +72,20 @@ void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in) { * * \return The number of outputs from last pipeline. */ -int SubGraphRuntime::NumOutputs() const { return runtimes.back()->runtimePtr->NumOutputs(); } +int SubGraphRuntime::NumOutputs() const { return outpuNumber; } /*! * \brief Get the number of inputs * * \return The number of inputs to the first pipeline. */ -int SubGraphRuntime::NumInputs() const { return runtimes.front()->runtimePtr->NumInputs(); } +int SubGraphRuntime::NumInputs() const { + int inputsNum = 0; + for (auto runtime : runtimes) { + inputsNum += runtime->runtimePtr->NumInputs(); + } + return inputsNum; +} /*! * \brief Return NDArray for given input index. @@ -110,10 +123,16 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, // Return member functions during query. if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + /* Default use first runtime index value. 
+ */ + int modIndx = 0; + if (args.num_args == 3) { + modIndx = static_cast(args[2]); + } if (String::CanConvertFrom(args[0])) { - this->SetInput(args[0].operator String(), args[1]); + this->SetInput(args[0].operator String(), args[1], modIndx); } else { - this->SetInput(static_cast(args[0]), args[1]); + this->SetInput(static_cast(args[0]), args[1], modIndx); } }); } else if (name == "get_output") { diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index abe9a5e40918..5422b5b59974 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -81,8 +81,8 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { * \param index The input index. * \param data_in The input data. */ - void SetInput(int index, DLTensor* data_in); - void SetInput(const std::string& name, DLTensor* data_in); + void SetInput(int index, DLTensor* data_in, int modIndx); + void SetInput(const std::string& name, DLTensor* data_in, int modIndx); NDArray GetInput(int index, int mIndx) const; NDArray GetInput(const std::string& name, int mIndx) const; /*! @@ -162,6 +162,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { vector output_entry_; PIPELINE_CONF pipeline_conf; vector> runtimes; + size_t outpuNumber = 0; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index cd916082f386..88c51705a8ec 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -53,18 +53,19 @@ thread* pipeline_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { return NULL; } -void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, - PIPELINE_CONF* pipeline_conf) { +size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, + PIPELINE_CONF* pipeline_conf) { + int outputNum = 0; int len = graphRuntimes.size(); for (int i = 0; i < len; i++) { QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); /* runtimeIndx start from 1. */ int runtimeIndx = i + 1; - auto runItem = make_shared(graphRuntimes[i], sub_queue, - &((*pipeline_conf)[runtimeIndx]), runtimeIndx); + auto& pConf = pipeline_conf->at(runtimeIndx); + auto runItem = make_shared(graphRuntimes[i], sub_queue, &pConf, runtimeIndx); runtimes->push_back(runItem); - /*set prev and next for RuntimeItem, runtime need these information to + /* set prev and next for RuntimeItem, runtime need these information to * poll data from prev and do notification for next. */ if (i > 0) { @@ -73,9 +74,20 @@ void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, if (i == len - 1) { (*runtimes)[i]->next = (*runtimes)[0]; } + /* get output number. + */ + if (i < len - 1) { + for (auto depMap : pConf) { + /* output is final output when dependent number is 0. 
+ */ + outputNum += depMap.second.find(0) != depMap.second.end(); + } + } else { + outputNum += runItem->runtimePtr->NumOutputs(); + } } pipeline_pipeline_init(runtimes); - return; + return outputNum; } inline void pipeline_queue_push(QUEUE* queue, vector>* outputs) { diff --git a/src/runtime/pipeline/pipeline_function.h b/src/runtime/pipeline/pipeline_function.h index c3a8a29f76d3..440956a69a53 100644 --- a/src/runtime/pipeline/pipeline_function.h +++ b/src/runtime/pipeline/pipeline_function.h @@ -29,8 +29,8 @@ using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; typedef unordered_map>> PIPELINE_CONF; -void pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, - PIPELINE_CONF* pipeline_conf); +size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, + PIPELINE_CONF* pipeline_conf); void pipeline_run(const SHARED_RUNTIME_VEC& runtimes); inline void pipeline_queue_push(QUEUE* queue, vector>* outputs); bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData); diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 060b8425058f..4702ae9b7cf4 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -23,7 +23,7 @@ from tvm.contrib import graph_executor, pipeline_executor -def run_modules(mod_configs, dev, target, dname, data): +def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): mod_input = {} final_output = {} indx = 1 @@ -40,6 +40,11 @@ def run_modules(mod_configs, dev, target, dname, data): m.set_input(input["index"] - 1, input["data"]) else: m.set_input(dname, data) + + # set input for specify module + if mod == iMod: + m.set_input(iName, iData) + m.run() n = m.get_num_outputs() # parse mod_config and set current output as next mod input data @@ -68,7 +73,8 @@ def run_modules(mod_configs, dev, target, dname, data): def get_mannual_mod(): mods = [] dshape = (3, 3) - data = relay.var("data", relay.TensorType(dshape, "float32")) + data = relay.var("data_0", relay.TensorType(dshape, "float32")) + data21 = relay.var("data_1", relay.TensorType(dshape, "float32")) data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32")) data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32")) @@ -86,6 +92,7 @@ def get_mannual_mod(): # net2 use net1 output1 as input net2 = relay.add(data_net1_output_1, mv2) + net2 = relay.add(net2, data21) net2 = relay.add(net2, mv3) # net3 use net2 output1 and net1 outpu2 as input @@ -97,7 +104,7 @@ def get_mannual_mod(): relay.Function([data], relay.Tuple([net_output1, net_output2, net_output3])) ) ) - mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1], net2))) + mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1, data21], net2))) mods.append( tvm.IRModule.from_expr(relay.Function([data_net1_output_2, data_net2_output_1], net3)) ) @@ -129,7 +136,7 @@ def run_pipeline(target): mconfig1["pipeline"] = { "mod_indx": 1, "output": [ - {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_indx": 1}]}, + {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_indx": 2}]}, {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_indx": 1}]}, {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_indx": 1}]}, ], @@ -160,7 +167,10 @@ def run_pipeline(target): """ #Run with graph executor for verification purpose """ - outs = [run_modules(mod_config, 
tvm.cpu(), "llvm", "data", data) for data in datas] + outs = [ + run_modules(mod_config, tvm.cpu(), "llvm", "data_0", data, mods[1], "data_1", data) + for data in datas + ] """ @@ -173,7 +183,8 @@ def run_pipeline(target): #Use pipeline executor to pipeline the said pipeline which use different backend """ for data in datas: - pipeline_module.set_input("data", data) + pipeline_module.set_input("data_0", data) + pipeline_module.set_input("data_1", data, modindx=1) pipeline_module.run() """ @@ -204,4 +215,5 @@ def test_pipeline(): if __name__ == "__main__": - test_pipeline() + if pipeline_executor.pipeline_executor_enabled(): + test_pipeline() From 3eb1b4c3f70c6af7f634c61e5660b02e0d60752c Mon Sep 17 00:00:00 2001 From: huajsj Date: Tue, 15 Jun 2021 01:07:43 -0700 Subject: [PATCH 18/28] add OTF input set support --- cmake/config.cmake | 2 +- python/tvm/contrib/graph_executor.py | 1 + python/tvm/contrib/pipeline_executor.py | 19 +++---- src/runtime/pipeline/pipeline_executor.cc | 51 +++++++++-------- src/runtime/pipeline/pipeline_executor.h | 32 ++++++++--- src/runtime/pipeline/pipeline_function.cc | 60 ++++++++++++++++++-- src/runtime/pipeline/pipeline_function.h | 11 +++- src/runtime/pipeline/pipeline_struct.h | 57 ++++++++++--------- tests/python/relay/test_pipeline_executor.py | 37 ++++++------ 9 files changed, 175 insertions(+), 95 deletions(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index 408463e772a5..76465ce49941 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -102,7 +102,7 @@ set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph executor. set(USE_GRAPH_EXECUTOR ON) # Whether enable subgraph runtime. -set(USE_PIPELINE_EXECUTOR ON) +set(USE_PIPELINE_EXECUTOR OFF) # Whether enable tiny graph executor with CUDA Graph set(USE_GRAPH_EXECUTOR_CUDA_GRAPH OFF) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index f064f8dbee69..94e4261aca2f 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -156,6 +156,7 @@ def __init__(self, module): self._run = module["run"] self._get_output = module["get_output"] self._get_input = module["get_input"] + self._get_input_index = module["get_input_index"] self._get_num_outputs = module["get_num_outputs"] self._get_input_index = module["get_input_index"] self._get_num_inputs = module["get_num_inputs"] diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 06c5bd0ddc78..0f88134e4028 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -22,7 +22,7 @@ def pipeline_executor_enabled(): - """ check if pipeline executor enabled. """ + """check if pipeline executor enabled.""" pipeline_enabled = False try: pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") @@ -69,7 +69,7 @@ def build_pipeline(config): string_config[mod_indx] = mod_config["pipeline"] build_func = relay.build # if there is a self defined build function then use it. - if mod_config["build"]: + if "build" in mod_config and mod_config["build"]: build_func = mod_config.build # build IRModule @@ -118,11 +118,8 @@ def create(mods, mod_config): class PipelineModule(object): - """Wrapper runtime module. - - This is a thin wrapper of the underlying TVM module. - you can also directly call set_input, run, and get_output - of underlying module functions + """Wrapper runtime module. This is a thin wrapper of the underlying TVM module. 
+ you can also directly call set_input, run, and get_output of underlying module functions. Parameters ---------- @@ -153,7 +150,7 @@ def __init__(self, graph_modules, pipeline_config): self._get_num_outputs = module["get_num_outputs"] self._get_num_inputs = module["get_num_inputs"] - def set_input(self, key, value, params=None, modindx=0): + def set_input(self, key, value, modindx=1, params=None): """Set inputs to the module via kwargs Parameters @@ -167,13 +164,13 @@ def set_input(self, key, value, params=None, modindx=0): params : dict of str to NDArray Additional arguments """ + assert modindx >= 1 if key is not None: - self.graph_modules_[modindx].set_input(key, value) + self._set_input(key, tvm.nd.array(value, tvm.cpu()), modindx) if params: for param in params: - self.graph_modules_[modindx].set_input(**param) - indx = indx + 1 + self.graph_modules_[modindx - 1].set_input(**param) def run(self): """Run forward execution of the graph""" diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 3ab18a5fe3b2..0cd801068714 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -34,7 +34,11 @@ void SubGraphRuntime::Stop() { pipeline_stop(runtimes); } /*! * \brief Run all the operations one by one. */ -void SubGraphRuntime::Run() { pipeline_run(runtimes); } +void SubGraphRuntime::Run() { + pipeline_run(runtimes, input_int_map); + /* Clear the input map + */ +} void SubGraphRuntime::Init(const Array& modules, const std::string& pipeline_json) { @@ -52,19 +56,11 @@ void SubGraphRuntime::Init(const Array& modules, * \param modIndx The runtime index. */ void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int modIndx) { - auto gruntime = runtimes[modIndx]; - gruntime->runtimePtr->SetInput(index, data_in); -} - -/*! - * \brief set index-th input to the modIndx-th graph. - * \param index The input index. - * \param data_in The input data. - * \param modIndx The runtime index. - */ -void SubGraphRuntime::SetInput(const std::string& name, DLTensor* data_in, int modIndx) { - auto gruntime = runtimes[modIndx]; - gruntime->runtimePtr->SetInput(name, data_in); + if (1 == modIndx) { + runtimes[0]->runtimePtr->SetInput(index, data_in); + } else { + pipeline_setinput(input_int_map, index, data_in, modIndx); + } } /*! @@ -98,9 +94,15 @@ NDArray SubGraphRuntime::GetInput(int index, int mIndx) const { return gruntime->runtimePtr->GetInput(index); } -NDArray SubGraphRuntime::GetInput(const std::string& name, int mIndx) const { - auto gruntime = runtimes[mIndx]; - return gruntime->runtimePtr->GetInput(name); +/*! + * \brief Return input index for given input name. + * \param name The input name. + * + * \return int corresponding to given input node name. + */ +int SubGraphRuntime::GetInputIndex(const string& name, int mIndx) const { + auto gruntime = runtimes[mIndx - 1]; + return gruntime->runtimePtr->GetInputIndex(name); } /*! @@ -120,7 +122,8 @@ Array SubGraphRuntime::GetOutput(bool syncPoll) { PackedFunc SubGraphRuntime::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - // Return member functions during query. + /* Return member functions during query. + */ if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { /* Default use first runtime index value. 
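The caller-side effect of the set_input rework, sketched with the names used in the updated test below (modindx counts from 1; inputs for any module after the first are copied to CPU and staged in input_int_map, then injected into the data stream when run() fires):

    # module 1's input goes straight to its graph executor; module 2's input
    # is buffered and travels down the queue together with module 1's outputs
    pipeline_module.set_input("data_0", data)             # module 1 (modindx=1)
    pipeline_module.set_input("data_1", data, modindx=2)  # module 2, on the fly
    pipeline_module.run()
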
@@ -130,7 +133,8 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, modIndx = static_cast(args[2]); } if (String::CanConvertFrom(args[0])) { - this->SetInput(args[0].operator String(), args[1], modIndx); + int index = this->GetInputIndex(args[0].operator String(), modIndx); + this->SetInput(index, args[1], modIndx); } else { this->SetInput(static_cast(args[0]), args[1], modIndx); } @@ -145,17 +149,18 @@ PackedFunc SubGraphRuntime::GetFunction(const std::string& name, }); } else if (name == "get_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - int in_idx = 0, graph_idx = 0; + int in_idx = 0, mod_idx = 0; if (args.num_args == 2) { - graph_idx = args[1]; + mod_idx = args[1]; } if (String::CanConvertFrom(args[0])) { - *rv = this->GetInput(args[0].operator String(), graph_idx); + int index = this->GetInputIndex(args[0].operator String(), mod_idx); + *rv = this->GetInput(index, mod_idx); } else { in_idx = args[0]; if (in_idx >= 0) { - *rv = this->GetInput(in_idx, graph_idx); + *rv = this->GetInput(in_idx, mod_idx); } } }); diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index 5422b5b59974..482d63a6c3fc 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -43,6 +43,7 @@ namespace runtime { */ class TVM_DLL SubGraphRuntime : public ModuleNode { public: + SubGraphRuntime() { input_int_map = make_shared(); } ~SubGraphRuntime() { /* stop pipeline threads and release data in deconstructor. */ @@ -82,9 +83,20 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { * \param data_in The input data. */ void SetInput(int index, DLTensor* data_in, int modIndx); - void SetInput(const std::string& name, DLTensor* data_in, int modIndx); + + /*! + * \brief get index-th input. + * \param index The input index. + * \return The input data. + */ NDArray GetInput(int index, int mIndx) const; - NDArray GetInput(const std::string& name, int mIndx) const; + + /*! + * \brief get input index-th by name. + * \param input name. + * \return The input index. + */ + int GetInputIndex(const string& name, int mIndx) const; /*! 
* \brief Get the number of outputs * @@ -111,7 +123,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { std::string key; reader->BeginObject(); int mod_indx = 0; - unordered_map> output; + unordered_map> output; while (reader->NextObjectItem(&key)) { if (key == "mod_indx") { reader->Read(&mod_indx); @@ -120,7 +132,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { reader->BeginArray(); while (reader->NextArrayItem()) { int output_indx = -1; - unordered_map depend; + unordered_map depend; reader->BeginObject(); while (reader->NextObjectItem(&key)) { if (key == "output_indx") { @@ -128,19 +140,20 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { } if (key == "dependent") { reader->BeginArray(); - int dep_mod_indx = -1, input_indx = -1; + int dep_mod_indx = -1; + string inputName; while (reader->NextArrayItem()) { reader->BeginObject(); while (reader->NextObjectItem(&key)) { if (key == "mod_indx") { reader->Read(&dep_mod_indx); } - if (key == "input_indx") { - reader->Read(&input_indx); + if (key == "input_name") { + reader->Read(&inputName); } } - if (dep_mod_indx >= 0 && input_indx >= 0) { - depend[dep_mod_indx] = input_indx; + if (dep_mod_indx >= 0) { + depend[dep_mod_indx] = inputName; } } } @@ -162,6 +175,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { vector output_entry_; PIPELINE_CONF pipeline_conf; vector> runtimes; + MOD_DLDATA_MAP_PTR input_int_map; size_t outpuNumber = 0; }; } // namespace runtime diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index 88c51705a8ec..aaad3e8bc04e 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -39,7 +39,7 @@ void pipeline_pipeline_run(const int& num, const shared_ptr& curRun curRunItem->Run(); vector> outputs; - curRunItem->GetOutput2(&outputs); + curRunItem->GetOutput(&outputs); pipeline_queue_push(nextQueue, &outputs); curRunItem->notifyDataReadyToNext(); } @@ -53,6 +53,24 @@ thread* pipeline_pipeline_init(SHARED_RUNTIME_VEC* runtimes) { return NULL; } +RUNTIME_PIPELINE_OUTPUT_CONF +pipeline_name_to_indx(const Array& graphRuntimes, + const RUNTIME_PIPELINE_OUTPUT_CONF_STR& pConfStr) { + RUNTIME_PIPELINE_OUTPUT_CONF confRet; + for (auto outConf : pConfStr) { + for (auto conf : outConf.second) { + int modIndx = conf.first; + if (modIndx) { + auto mGetIndex = ((Module)graphRuntimes[modIndx - 1]).GetFunction("get_input_index"); + confRet[outConf.first][modIndx] = (static_cast(mGetIndex(conf.second))) + 1; + } else { + confRet[outConf.first][modIndx] = stoi(conf.second); + } + } + } + return confRet; +} + size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, PIPELINE_CONF* pipeline_conf) { int outputNum = 0; @@ -62,7 +80,10 @@ size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, /* runtimeIndx start from 1. */ int runtimeIndx = i + 1; - auto& pConf = pipeline_conf->at(runtimeIndx); + /* get dependency configuration information. 
+ */ + auto pConf = pipeline_name_to_indx(graphRuntimes, pipeline_conf->at(runtimeIndx)); + auto runItem = make_shared(graphRuntimes[i], sub_queue, &pConf, runtimeIndx); runtimes->push_back(runItem); /* set prev and next for RuntimeItem, runtime need these information to @@ -99,12 +120,23 @@ bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData) { return q_poll(queue, runtimeData); } -void pipeline_run(const SHARED_RUNTIME_VEC& runtimes) { +void pipeline_run(const SHARED_RUNTIME_VEC& runtimes, const MOD_DLDATA_MAP_PTR indxInputs) { shared_ptr runtime = runtimes.front(); runtime->Run(); - + /* Get runtime output + */ vector> outputs; - runtime->GetOutput2(&outputs); + runtime->GetOutput(&outputs); + + /* Storage input data for runtimes after first runtime + */ + for (auto modInputs : *indxInputs) { + int modIndx = modInputs.first; + for (auto inputs : modInputs.second) { + outputs.push_back(make_shared(modIndx, inputs.first + 1, inputs.second->data)); + } + } + pipeline_queue_push(runtime->next->queue, &outputs); runtime->notifyDataReadyToNext(); return; @@ -133,3 +165,21 @@ void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes) { runtimes.front()->notifyNextExit(); } } + +void pipeline_setinput(MOD_DLDATA_MAP_PTR input_int_map, const int index, const DLTensor* data_in, + const int modIndx) { + if (input_int_map->find(modIndx) == input_int_map->end()) { + DLDATA_MAP dlmap; + dlmap[index] = nullptr; + input_int_map->insert({modIndx, dlmap}); + } else if (input_int_map->at(modIndx).find(index) == input_int_map->at(modIndx).end()) { + input_int_map->at(modIndx)[index] = nullptr; + } + + TENSOR_DATA tensor_data = input_int_map->at(modIndx)[index]; + if (tensor_data == nullptr) { + tensor_data = make_shared(); + input_int_map->at(modIndx)[index] = tensor_data; + } + tensor_data->CreateCopyFrom(data_in, kDLCPU, 0); +} diff --git a/src/runtime/pipeline/pipeline_function.h b/src/runtime/pipeline/pipeline_function.h index 440956a69a53..3c5671c6a891 100644 --- a/src/runtime/pipeline/pipeline_function.h +++ b/src/runtime/pipeline/pipeline_function.h @@ -19,6 +19,7 @@ #ifndef TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #define TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #include +#include #include #include @@ -27,15 +28,21 @@ using namespace std; using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; -typedef unordered_map>> PIPELINE_CONF; +typedef unordered_map>> PIPELINE_CONF; +typedef shared_ptr TENSOR_DATA; +typedef unordered_map DLDATA_MAP; +typedef unordered_map MOD_DLDATA_MAP; +typedef shared_ptr MOD_DLDATA_MAP_PTR; size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, PIPELINE_CONF* pipeline_conf); -void pipeline_run(const SHARED_RUNTIME_VEC& runtimes); +void pipeline_run(const SHARED_RUNTIME_VEC& runtimes, const MOD_DLDATA_MAP_PTR indxInputs); inline void pipeline_queue_push(QUEUE* queue, vector>* outputs); bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData); bool pipeline_poll(vector* output, const SHARED_RUNTIME_VEC& runtimes, const bool bSync = false); void pipeline_stop(const SHARED_RUNTIME_VEC& runtimes); +void pipeline_setinput(MOD_DLDATA_MAP_PTR input_int_map, const int index, const DLTensor* data_in, + const int modIndx); #endif // TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index cc153c9b0ca9..12305530c0da 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -40,6 +40,7 @@ using namespace tvm::runtime; 
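With the name-based dependency scheme above, a per-module "pipeline" section now looks roughly like this (values taken from the updated test; for a final output, mod_indx is 0 and input_name carries the stringified global output slot, which pipeline_name_to_indx converts back with stoi):

    {"mod_indx": 1,
     "output": [
         {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_name": "data_0"}]},
         {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_name": "data_0"}]},
         {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_name": "1"}]}
     ]}

Consumer input names are resolved to positional indices once, at pipeline_init time, through each graph module's get_input_index packed function.
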
using namespace std; typedef unordered_map> RUNTIME_PIPELINE_OUTPUT_CONF; +typedef unordered_map> RUNTIME_PIPELINE_OUTPUT_CONF_STR; /* thread control struction, for single consumer single producer mode. */ class TControl { @@ -121,9 +122,8 @@ class Dependent { bool NeedForward() { return (bFinal || depNum > 0); } }; -class InputData { +class TensorData { public: - Dependent dependent; DLTensor* data = nullptr; DLTensor* CreateCopyFrom(const DLTensor* from, int device_type, int device_id) { @@ -141,7 +141,7 @@ class InputData { TVMArrayCopyFromTo(const_cast(from), data, nullptr); return data; } - ~InputData() { + ~TensorData() { if (data) { TVMArrayFree(data); data = nullptr; @@ -149,6 +149,17 @@ class InputData { } }; +class InputData { + public: + Dependent dependent; + TensorData dlData; + + DLTensor* CreateCopyFrom(const DLTensor* from, int device_type, int device_id) { + dlData.CreateCopyFrom(from, device_type, device_id); + return dlData.data; + } +}; + class OutputData { public: OutputData(const NDArray& data, const size_t Indx, @@ -165,18 +176,23 @@ class OutputData { } } + OutputData(const int modIndx, const int inputIndx, const DLTensor* data) { + dltensor = const_cast(data); + dependent.SetDepModInputIndx(modIndx, inputIndx); + } + explicit OutputData(const InputData* pdata) { dependent = pdata->dependent; - /* caller need make sure pdata->data is avaialble. + /* caller need make sure pdata->dlData.data is avaialble. */ - dltensor = pdata->data; + dltensor = pdata->dlData.data; } OutputData& operator=(const InputData* pdata) { dependent = pdata->dependent; - /* caller need make sure pdata->data is avaialble. + /* caller need make sure pdata->dlData.data is avaialble. */ - dltensor = pdata->data; + dltensor = pdata->dlData.data; return *this; } @@ -241,7 +257,7 @@ class PipelineData { ResetDataList(num); for (size_t i = 0; i < num; i++) { - CreateCopyFrom(dlInput[i]->data, dlInput[i]->dependent, &inputList[i], device_type, + CreateCopyFrom(dlInput[i]->dlData.data, dlInput[i]->dependent, &inputList[i], device_type, device_id); } return; @@ -290,7 +306,7 @@ class pipelineOutputData { /* output may not ordered by index in slot, use a map to index them. 
*/ for (size_t i = 0; i < slot.data.num; i++) { - auto dlTensor = slot.data.inputList[i]->data; + auto dlTensor = slot.data.inputList[i]->dlData.data; int outputIndx = slot.data.inputList[i]->dependent.GetOutputIndx() - 1; assert(outputIndx < slot.data.num); dataMap[outputIndx] = dlTensor; @@ -343,6 +359,7 @@ class RuntimeFunction { tvm::runtime::PackedFunc set_input; tvm::runtime::PackedFunc get_output; tvm::runtime::PackedFunc get_input; + tvm::runtime::PackedFunc get_input_index; tvm::runtime::PackedFunc run; explicit RuntimeFunction(const Module& m) { module_ = m; @@ -351,6 +368,7 @@ class RuntimeFunction { set_input = module_.GetFunction("set_input"); get_output = module_.GetFunction("get_output"); get_input = module_.GetFunction("get_input"); + get_input_index = module_.GetFunction("get_input_index"); run = module_.GetFunction("run"); } ~RuntimeFunction() { @@ -418,6 +436,8 @@ class RuntimeFunction { NDArray GetInput(int index) const { return get_input(index); } + int GetInputIndex(const std::string& name) { return get_input_index(name); } + void Run() { run(); } }; @@ -452,7 +472,7 @@ class RuntimeData { */ int inputIndx = data[i]->dependent.GetDepModInputIndx(runtimeIndx); if (inputIndx > 0) { - runtimePtr->SetInput(inputIndx - 1, data[i]->data); + runtimePtr->SetInput(inputIndx - 1, data[i]->dlData.data); /* data getused remove dependent reference for current runtime */ data[i]->dependent.RemoveDependentRef(runtimeIndx); @@ -538,22 +558,7 @@ class RuntimeItem { } } - /* - * Here we need to use a container to storage NDArray that from - * GetOutput, if just copy the data but not storage NDArray, the - * memory of data may get freed, especially for RPC device data, - */ - Array GetOutput(void) { - Array outputs; - size_t outputsNum = runtimePtr->NumOutputs(); - for (size_t i = 0; i < outputsNum; i++) { - auto output = runtimePtr->GetOutput(i); - outputs.push_back(output); - } - return outputs; - } - - void GetOutput2(vector>* outputs) { + void GetOutput(vector>* outputs) { size_t outputsNum = runtimePtr->NumOutputs(); for (size_t i = 0; i < outputsNum; i++) { shared_ptr output = diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 4702ae9b7cf4..727b31d836e2 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -37,7 +37,7 @@ def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): if mod_key in mod_input: for input in mod_input[mod_key]: input = mod_input[mod_key][input] - m.set_input(input["index"] - 1, input["data"]) + m.set_input(input["index"], input["data"]) else: m.set_input(dname, data) @@ -53,17 +53,17 @@ def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): output_data = m.get_output(output["output_indx"] - 1).asnumpy() for dep in output["dependent"]: # currnet output use as dependent input, - # input_indx indicate the input index number. - input_indx = dep["input_indx"] + # input_name indicate the input index number. 
mod_indx = dep["mod_indx"] + input_name = dep["input_name"] if mod_indx == 0: - final_output[input_indx] = output_data + final_output[input_name] = output_data else: if mod_indx in mod_input: - mod_input[mod_indx][input_indx] = {"index": input_indx, "data": output_data} + mod_input[mod_indx][input_name] = {"index": input_name, "data": output_data} else: mod_input[mod_indx] = { - input_indx: {"index": input_indx, "data": output_data} + input_name: {"index": input_name, "data": output_data} } indx = indx + 1 @@ -136,9 +136,9 @@ def run_pipeline(target): mconfig1["pipeline"] = { "mod_indx": 1, "output": [ - {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_indx": 2}]}, - {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_indx": 1}]}, - {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_indx": 1}]}, + {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_name": "data_0"}]}, + {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_name": "data_0"}]}, + {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_name": "1"}]}, ], } mod_config[mods[0]] = mconfig1 @@ -149,7 +149,7 @@ def run_pipeline(target): mconfig2["pipeline"] = { "mod_indx": 2, "output": [ - {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_indx": 2}]}, + {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_name": "data_1"}]}, ], } mod_config[mods[1]] = mconfig2 @@ -160,7 +160,7 @@ def run_pipeline(target): mconfig3["pipeline"] = { "mod_indx": 3, - "output": [{"output_indx": 1, "dependent": [{"mod_indx": 0, "input_indx": 2}]}], + "output": [{"output_indx": 1, "dependent": [{"mod_indx": 0, "input_name": "2"}]}], } mod_config[mods[2]] = mconfig3 @@ -182,9 +182,10 @@ def run_pipeline(target): """ #Use pipeline executor to pipeline the said pipeline which use different backend """ + d3 = np.full(dshape, 10).astype("float32") for data in datas: pipeline_module.set_input("data_0", data) - pipeline_module.set_input("data_1", data, modindx=1) + pipeline_module.set_input("data_1", data, modindx=2) pipeline_module.run() """ @@ -205,15 +206,15 @@ def run_pipeline(target): """ for ref_out, out in zip(outs, pipeline_outputs): for ref in ref_out: - tvm.testing.assert_allclose(ref_out[ref], out[ref - 1]) + tvm.testing.assert_allclose(ref_out[ref], out[int(ref) - 1]) def test_pipeline(): - target_list = tvm.testing.enabled_targets() - for target in target_list: - run_pipeline(target) + if pipeline_executor.pipeline_executor_enabled(): + target_list = tvm.testing.enabled_targets() + for target in target_list: + run_pipeline(target) if __name__ == "__main__": - if pipeline_executor.pipeline_executor_enabled(): - test_pipeline() + test_pipeline() From 4d19179459dd1b27cc5a3fb24d3afadef13fda8f Mon Sep 17 00:00:00 2001 From: huajsj Date: Tue, 22 Jun 2021 16:00:32 -0700 Subject: [PATCH 19/28] decouple pipeline_build and create. --- python/tvm/contrib/pipeline_executor.py | 9 ++++----- tests/python/relay/test_pipeline_executor.py | 4 +++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 0f88134e4028..88f720059f39 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -87,15 +87,15 @@ def build_pipeline(config): return mods, string_config -def create(mods, mod_config): +def create(pipeline_mods, mod_config): """Create a pipeline runtime executor. 
Parameters
     ----------
-    mods : List[IRModule]
+    pipeline_mods : List[IRModule]
         list of IRModule
 
-    mod_config : Dict[IRModule, Dict[str, Any]]
+    mod_config : Dict[int, Dict[str, Any]]
         module and module dependency configuration information.
 
     Returns
@@ -103,7 +103,6 @@ def create(mods, mod_config):
     submodule : PipelineModule
         Runtime pipeline module.
     """
-    pipeline_mods, string_config = build_pipeline(mod_config)
     mods = []
 
     for pipeline_mod in pipeline_mods:
@@ -113,7 +112,7 @@ def create(mods, mod_config):
 
         mods.append(mod)
 
-    submodule = PipelineModule(mods, json.dumps(string_config))
+    submodule = PipelineModule(mods, json.dumps(mod_config))
     return submodule
 
diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
index 727b31d836e2..b00126b5d60d 100644
--- a/tests/python/relay/test_pipeline_executor.py
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -177,7 +177,9 @@ def run_pipeline(target):
     #build and create pipeline module
     """
     with relay.build_config(opt_level=3):
-        pipeline_module = pipeline_executor.create(mods, mod_config)
+        pipeline_mods, string_config = pipeline_executor.build_pipeline(mod_config)
+
+        pipeline_module = pipeline_executor.create(pipeline_mods, string_config)
 
     """
     #Use pipeline executor to pipeline the said pipeline which use different backend

From df18253bd57ba7f4dbf7edc08c64187c10cac8a0 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Wed, 23 Jun 2021 12:27:33 -0700
Subject: [PATCH 20/28] address review comments.

---
 cmake/config.cmake                           |   1 +
 python/tvm/contrib/graph_executor.py         |  10 --
 python/tvm/contrib/pipeline_executor.py      | 105 ++++++++++++-----
 src/runtime/pipeline/pipeline_executor.cc    |  39 +++----
 src/runtime/pipeline/pipeline_executor.h     |  39 +++++--
 src/runtime/pipeline/pipeline_function.cc    |  65 +++++++++++-
 src/runtime/pipeline/pipeline_function.h     |  13 ++-
 tests/python/relay/test_pipeline_executor.py |   5 +-
 8 files changed, 201 insertions(+), 76 deletions(-)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 76465ce49941..b520a9627b83 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -101,6 +101,7 @@ set(USE_STACKVM_RUNTIME OFF)
 
 # Whether enable tiny embedded graph executor.
 set(USE_GRAPH_EXECUTOR ON)
+
 # Whether enable subgraph runtime.
 set(USE_PIPELINE_EXECUTOR OFF)
 
diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py
index 94e4261aca2f..9321c8cc7753 100644
--- a/python/tvm/contrib/graph_executor.py
+++ b/python/tvm/contrib/graph_executor.py
@@ -246,16 +246,6 @@ def get_input(self, index, out=None):
 
     def get_input_index(self, name):
         """Get input index via the input name.
-
-        Parameters
-        ----------
-        name : str
-           The input key name
-
-        Returns
-        -------
-        index: int
-            The input index. -1 will be returned if the given input name is not found.
         """
         return self._get_input_index(name)
 
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index 88f720059f39..de47084e51e7 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -22,7 +22,12 @@
 
 
 def pipeline_executor_enabled():
-    """check if pipeline executor enabled."""
+    """Check if the pipeline executor is enabled.
+
+    Returns
+    -------
+    enable: bool
+        Whether the pipeline executor is enabled.
+    """
     pipeline_enabled = False
     try:
         pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create")
@@ -34,19 +39,39 @@ def pipeline_executor_enabled():
     return pipeline_enabled
 
 
-def build_pipeline(config):
-    """build module list that can use for pipeline execution.
+def write_file(file_name, data, mode):
+    """Write data into a file.
 
     Parameters
     ----------
+    file_name: str
+        The file name.
+    data: str
+        The data to write.
+    mode: str
+        The file open mode.
+    """
+    if file_name:
+        with open(file_name, mode) as file_handle:
+            file_handle.write(data)
+
+    return
+
+
+def build_pipeline(mod_n_configs, export_path=None):
+    """Build a module list that can be used for pipeline execution.
 
-    config: Dict[IRModule, Dict[str, Any]]
+    Parameters
+    ----------
+    mod_n_configs: Dict[IRModule, Dict[str, Any]]
      build configuration information, structured as follows.
      {IRModule: {"target":target,
                  "target_host":target_host,
                  "params":params,
                  "mod_name": mod_name,
                  "build":build}}
+    export_path: str
+        The path to export the build results to.
 
     Returns
    -------
@@ -56,21 +81,25 @@ def build_pipeline(config):
        pipeline configuration
    """
    mods = {}
-    config_len = len(config)
+    config_len = len(mod_n_configs)
    string_config = [{} for _ in range(config_len)]
-    for ir_mod in config:
+    for _, (ir_mod, mod_config) in enumerate(mod_n_configs.items()):
+        # initialize lib_name, json_name and params_name to empty
+        lib_name = ""
+        json_name = ""
+        params_name = ""
         # Get module configuration
-        mod_config = config[ir_mod]
         assert "pipeline" in mod_config and "mod_indx" in mod_config["pipeline"]
         # Get module index in pipeline configuration
-        mod_indx = mod_config["pipeline"]["mod_indx"] - 1
+        mconf = mod_config["pipeline"].copy()
+        # Get mod device config
+        dev = mod_config["dev"]
+        mod_indx = mconf["mod_indx"] - 1
         assert mod_indx < config_len
-        # Create pipeline configuration
-        string_config[mod_indx] = mod_config["pipeline"]
         build_func = relay.build
         # if there is a self-defined build function, use it.
if "build" in mod_config and mod_config["build"]: - build_func = mod_config.build + build_func = mod_config["build"] # build IRModule mod = build_func( @@ -81,7 +110,28 @@ def build_pipeline(config): mod_name=mod_config["mod_name"], ) - mods[mod] = {"dev": mod_config["dev"]} + if export_path: + graph, lib, params = mod + lib_name = "{}/lib{}.so".format(export_path, mod_indx) + json_name = "{}/json{}".format(export_path, mod_indx) + params_name = "{}/params{}".format(export_path, mod_indx) + lib.export_library(lib_name) + write_file(json_name, graph, "w") + write_file(params_name, relay.save_param_dict(params), "wb") + + mconf["lib_name"] = lib_name + mconf["json_name"] = json_name + mconf["params_name"] = params_name + mconf["dev"] = "{},{}".format(dev.device_type, dev.device_id) + # Create pipeline configuration + string_config[mod_indx] = mconf + # associate mod with device + mods[mod] = {"dev": dev} + + if export_path: + write_file("{}/config".format(export_path), json.dumps(string_config), "w") + #with open("{}/config".format(export_path), "w") as config_file: + # config_file.write(json.dumps(string_config)) # return IRModule list and pipeline configuration return mods, string_config @@ -109,10 +159,11 @@ def create(pipeline_mods, mod_config): mod = graph_executor.GraphModule( pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"]) ) - - mods.append(mod) + + mods.append(mod.module) submodule = PipelineModule(mods, json.dumps(mod_config)) + #submodule = PipelineModule(pipeline_mods, json.dumps(mod_config)) return submodule @@ -130,16 +181,16 @@ class PipelineModule(object): """ - def __init__(self, graph_modules, pipeline_config): + def __init__(self, modules, pipeline_config): mods = [] - for module in graph_modules: - mods.append(module.module) + for module in modules: + mods.append(module) pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") assert pipelinecreate module = pipelinecreate(mods, pipeline_config) - self.graph_modules_ = graph_modules + self.graph_modules_ = modules self._set_input = module["set_input"] self._run = module["run"] @@ -149,27 +200,29 @@ def __init__(self, graph_modules, pipeline_config): self._get_num_outputs = module["get_num_outputs"] self._get_num_inputs = module["get_num_inputs"] - def set_input(self, key, value, modindx=1, params=None): + def set_input(self, key, value, mod_idx=1, params=None): """Set inputs to the module via kwargs Parameters ---------- key : array_like - The input key + The input key value : array_like. - The input key + The input key + + mod_idx : int + the submodule index params : dict of str to NDArray - Additional arguments + Additional arguments """ - assert modindx >= 1 - if key is not None: - self._set_input(key, tvm.nd.array(value, tvm.cpu()), modindx) + assert mod_idx >= 1 + self._set_input(key, tvm.nd.array(value, tvm.cpu()), mod_idx) if params: for param in params: - self.graph_modules_[modindx - 1].set_input(**param) + self.graph_modules_[mod_idx - 1].set_input(**param) def run(self): """Run forward execution of the graph""" diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 0cd801068714..422bae4b61cd 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -22,30 +22,22 @@ */ #include "pipeline_executor.h" -#include - namespace tvm { namespace runtime { -/*! - *\bief Stop pipeline run. - */ -void SubGraphRuntime::Stop() { pipeline_stop(runtimes); } +/*! \bief Stop pipeline run. 
*/ +void SubGraphRuntime::Stop() { pipeline_stop(runtimes_); } /*! * \brief Run all the operations one by one. */ -void SubGraphRuntime::Run() { - pipeline_run(runtimes, input_int_map); - /* Clear the input map - */ -} +void SubGraphRuntime::Run() { pipeline_run(runtimes_, input_int_map_); } void SubGraphRuntime::Init(const Array& modules, const std::string& pipeline_json) { std::istringstream is(pipeline_json); dmlc::JSONReader reader(&is); this->Load(&reader); - outpuNumber = pipeline_init(modules, &runtimes, &pipeline_conf); + outpuNumber_ = pipeline_init(modules, &runtimes_, pipeline_conf_, mod_conf_); return; } @@ -55,11 +47,11 @@ void SubGraphRuntime::Init(const Array& modules, * \param data_in The input data. * \param modIndx The runtime index. */ -void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int modIndx) { - if (1 == modIndx) { - runtimes[0]->runtimePtr->SetInput(index, data_in); +void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int mod_idx) { + if (1 == mod_idx) { + runtimes_[0]->runtimePtr->SetInput(index, data_in); } else { - pipeline_setinput(input_int_map, index, data_in, modIndx); + pipeline_setinput(input_int_map_, index, data_in, mod_idx); } } @@ -68,7 +60,7 @@ void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int modIndx) { * * \return The number of outputs from last pipeline. */ -int SubGraphRuntime::NumOutputs() const { return outpuNumber; } +int SubGraphRuntime::NumOutputs() const { return outpuNumber_; } /*! * \brief Get the number of inputs @@ -77,7 +69,7 @@ int SubGraphRuntime::NumOutputs() const { return outpuNumber; } */ int SubGraphRuntime::NumInputs() const { int inputsNum = 0; - for (auto runtime : runtimes) { + for (auto runtime : runtimes_) { inputsNum += runtime->runtimePtr->NumInputs(); } return inputsNum; @@ -90,7 +82,7 @@ int SubGraphRuntime::NumInputs() const { * \return NDArray corresponding to given input node index. */ NDArray SubGraphRuntime::GetInput(int index, int mIndx) const { - auto gruntime = runtimes[mIndx]; + auto gruntime = runtimes_[mIndx]; return gruntime->runtimePtr->GetInput(index); } @@ -101,7 +93,7 @@ NDArray SubGraphRuntime::GetInput(int index, int mIndx) const { * \return int corresponding to given input node name. */ int SubGraphRuntime::GetInputIndex(const string& name, int mIndx) const { - auto gruntime = runtimes[mIndx - 1]; + auto gruntime = runtimes_[mIndx - 1]; return gruntime->runtimePtr->GetInputIndex(name); } @@ -112,7 +104,7 @@ int SubGraphRuntime::GetInputIndex(const string& name, int mIndx) const { */ Array SubGraphRuntime::GetOutput(bool syncPoll) { Array nd; - if (pipeline_poll(&output_entry_, runtimes, syncPoll)) { + if (pipeline_poll(&output_entry_, runtimes_, syncPoll)) { for (auto output : output_entry_) { nd.push_back(output); } @@ -122,12 +114,9 @@ Array SubGraphRuntime::GetOutput(bool syncPoll) { PackedFunc SubGraphRuntime::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { - /* Return member functions during query. - */ if (name == "set_input") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - /* Default use first runtime index value. - */ + // Default use first runtime index value. 
int modIndx = 0; if (args.num_args == 3) { modIndx = static_cast(args[2]); diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index 482d63a6c3fc..f743d56c57ca 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -43,7 +43,7 @@ namespace runtime { */ class TVM_DLL SubGraphRuntime : public ModuleNode { public: - SubGraphRuntime() { input_int_map = make_shared(); } + SubGraphRuntime() { input_int_map_ = make_shared(); } ~SubGraphRuntime() { /* stop pipeline threads and release data in deconstructor. */ @@ -82,7 +82,7 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { * \param index The input index. * \param data_in The input data. */ - void SetInput(int index, DLTensor* data_in, int modIndx); + void SetInput(int index, DLTensor* data_in, int mod_idx); /*! * \brief get index-th input. @@ -123,11 +123,33 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { std::string key; reader->BeginObject(); int mod_indx = 0; + std::string libName; + std::string jsonName; + std::string paramsName; + std::string dev; unordered_map> output; + unordered_map> lib; while (reader->NextObjectItem(&key)) { if (key == "mod_indx") { reader->Read(&mod_indx); } + + if (key == "lib_name") { + reader->Read(&libName); + } + + if (key == "json_name") { + reader->Read(&jsonName); + } + + if (key == "params_name") { + reader->Read(¶msName); + } + + if (key == "dev") { + reader->Read(&dev); + } + if (key == "output") { reader->BeginArray(); while (reader->NextArrayItem()) { @@ -166,17 +188,20 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { } } if (mod_indx >= 0) { - pipeline_conf[mod_indx] = output; + pipeline_conf_[mod_indx] = output; + mod_conf_[mod_indx] = { + {"lib_name", libName}, {"json_name", jsonName}, {"params", paramsName}, {"dev", dev}}; } } } protected: vector output_entry_; - PIPELINE_CONF pipeline_conf; - vector> runtimes; - MOD_DLDATA_MAP_PTR input_int_map; - size_t outpuNumber = 0; + PIPELINE_CONF pipeline_conf_; + MOD_CONF mod_conf_; + vector> runtimes_; + MOD_DLDATA_MAP_PTR input_int_map_; + size_t outpuNumber_ = 0; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index aaad3e8bc04e..98355bee1d4e 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -20,7 +20,7 @@ #include using namespace tvm::runtime; - +using namespace std; void pipeline_pipeline_run(const int& num, const shared_ptr& curRunItem) { QUEUE* curQueue = curRunItem->queue; QUEUE* nextQueue = curRunItem->next->queue; @@ -71,9 +71,66 @@ pipeline_name_to_indx(const Array& graphRuntimes, return confRet; } -size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, - PIPELINE_CONF* pipeline_conf) { +vector pipeline_graph_runtime(Array modules, const MOD_CONF& mod_conf) { + const PackedFunc* graphRuntimeCreate = Registry::Get("tvm.graph_executor.create"); + vector ret; + // if modules not empty just return in vector container + if (!modules.empty()) { + for (auto mod : modules) { + ret.push_back(mod); + } + } else {// if modules is empty, need to build the graph runtime + ret.resize(mod_conf.size()); + for (auto mconf : mod_conf) { + // load lib + auto lib = Module::LoadFromFile(mconf.second["lib_name"].c_str()); + + // read json + ifstream ifJson(mconf.second["json_name"].c_str()); + if (ifJson.fail()) { + throw std::runtime_error("json file not found!"); + } + const std::string 
json((istreambuf_iterator(ifJson)), istreambuf_iterator()); + + // create graph runtime + istringstream istr(mconf.second["dev"]); + string str; + int deviceType = 1, deviceId = 0; + while(getline(istr, str, ';')) { + istringstream istrDev(str); + string stemp; + if (getline(istrDev, stemp)) { + deviceType = stoi(stemp); + } + if (getline(istrDev, stemp)) { + deviceId = stoi(stemp); + } + } + Module graphModule = (*graphRuntimeCreate)(json, lib, deviceType, deviceId); + + // load parameter + TVMByteArray params_arr; + ifstream ifParam(mconf.second["params"].c_str()); + if (ifParam.fail()) { + throw std::runtime_error("params file not found!"); + } + const std::string params((istreambuf_iterator(ifParam)), istreambuf_iterator()); + params_arr.data = params.c_str(); + params_arr.size = params.length(); + auto load_params = graphModule.GetFunction("load_params"); + load_params(params_arr); + + // put into return vector + ret[mconf.first - 1] = graphModule; + } + } + return ret; +} + +size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, + const PIPELINE_CONF& pipeline_conf, const MOD_CONF& mod_conf) { int outputNum = 0; + vector graphRuntimes = pipeline_graph_runtime(modules, mod_conf); int len = graphRuntimes.size(); for (int i = 0; i < len; i++) { QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); @@ -82,7 +139,7 @@ size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, int runtimeIndx = i + 1; /* get dependency configuration information. */ - auto pConf = pipeline_name_to_indx(graphRuntimes, pipeline_conf->at(runtimeIndx)); + auto pConf = pipeline_name_to_indx(graphRuntimes, pipeline_conf.at(runtimeIndx)); auto runItem = make_shared(graphRuntimes[i], sub_queue, &pConf, runtimeIndx); runtimes->push_back(runItem); diff --git a/src/runtime/pipeline/pipeline_function.h b/src/runtime/pipeline/pipeline_function.h index 3c5671c6a891..bb820147018d 100644 --- a/src/runtime/pipeline/pipeline_function.h +++ b/src/runtime/pipeline/pipeline_function.h @@ -18,6 +18,13 @@ */ #ifndef TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ #define TVM_RUNTIME_PIPELINE_PIPELINE_FUNCTION_H_ + +#include +#include +#include +#include + +#include #include #include #include @@ -29,13 +36,15 @@ using namespace std; using namespace tvm::runtime; typedef vector> SHARED_RUNTIME_VEC; typedef unordered_map>> PIPELINE_CONF; +typedef unordered_map> MOD_CONF; typedef shared_ptr TENSOR_DATA; typedef unordered_map DLDATA_MAP; typedef unordered_map MOD_DLDATA_MAP; typedef shared_ptr MOD_DLDATA_MAP_PTR; -size_t pipeline_init(Array graphRuntimes, SHARED_RUNTIME_VEC* runtimes, - PIPELINE_CONF* pipeline_conf); +vector pipeline_get_graphRuntime(Array modules, const MOD_CONF& mod_conf); +size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, + const PIPELINE_CONF& pipeline_conf, const MOD_CONF& mod_conf); void pipeline_run(const SHARED_RUNTIME_VEC& runtimes, const MOD_DLDATA_MAP_PTR indxInputs); inline void pipeline_queue_push(QUEUE* queue, vector>* outputs); bool pipeline_queue_poll(QUEUE* queue, RuntimeData* runtimeData); diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index b00126b5d60d..f08bcfd23382 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -177,7 +177,8 @@ def run_pipeline(target): #build and create pipeline module """ with relay.build_config(opt_level=3): - pipeline_mods, string_config = pipeline_executor.build_pipeline(mod_config) + pipeline_mods, string_config = 
pipeline_executor.build_pipeline(mod_config,
+                                                                         "/scratch/hj/data/")
 
         pipeline_module = pipeline_executor.create(pipeline_mods, string_config)
 
@@ -187,7 +188,7 @@ def run_pipeline(target):
     d3 = np.full(dshape, 10).astype("float32")
     for data in datas:
         pipeline_module.set_input("data_0", data)
-        pipeline_module.set_input("data_1", data, modindx=2)
+        pipeline_module.set_input("data_1", data, mod_idx=2)
         pipeline_module.run()
 
     """

From 29ecb2b03a9d2597e754e01c679c649bc2bac602 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Sun, 4 Jul 2021 20:21:30 -0700
Subject: [PATCH 21/28] fix pylint issue, improve Load function.

---
 python/tvm/contrib/pipeline_executor.py      |   9 +-
 src/runtime/pipeline/pipeline_executor.h     | 104 ++++++++++---------
 src/runtime/pipeline/pipeline_function.cc    |  22 ++--
 tests/python/relay/test_pipeline_executor.py |   4 +-
 4 files changed, 74 insertions(+), 66 deletions(-)

diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index de47084e51e7..04287bd7837d 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -55,8 +55,6 @@ def write_file(file_name, data, mode):
         with open(file_name, mode) as file_handle:
             file_handle.write(data)
 
-    return
-
@@ -130,8 +128,8 @@ def build_pipeline(mod_n_configs, export_path=None):
 
     if export_path:
         write_file("{}/config".format(export_path), json.dumps(string_config), "w")
-        #with open("{}/config".format(export_path), "w") as config_file:
-        #    config_file.write(json.dumps(string_config))
+        # with open("{}/config".format(export_path), "w") as config_file:
+        #     config_file.write(json.dumps(string_config))
 
     # return IRModule list and pipeline configuration
     return mods, string_config
@@ -159,11 +157,10 @@ def create(pipeline_mods, mod_config):
         mod = graph_executor.GraphModule(
             pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"])
         )
-
-        mods.append(mod.module)
+        mods.append(mod.module)
 
     submodule = PipelineModule(mods, json.dumps(mod_config))
-    #submodule = PipelineModule(pipeline_mods, json.dumps(mod_config))
+    # submodule = PipelineModule(pipeline_mods, json.dumps(mod_config))
     return submodule
 
diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h
index f743d56c57ca..8cd99df1ab56 100644
--- a/src/runtime/pipeline/pipeline_executor.h
+++ b/src/runtime/pipeline/pipeline_executor.h
@@ -117,6 +117,58 @@ class TVM_DLL SubGraphRuntime : public ModuleNode {
    */
  Array<NDArray> GetOutput(bool syncPoll = true);
 
+ protected:
+  vector<NDArray> output_entry_;
+  PIPELINE_CONF pipeline_conf_;
+  MOD_CONF mod_conf_;
+  vector<shared_ptr<RuntimeItem>> runtimes_;
+  MOD_DLDATA_MAP_PTR input_int_map_;
+  size_t outpuNumber_ = 0;
+
+  unordered_map<int, string> LoadDependent(dmlc::JSONReader* reader) {
+    unordered_map<int, string> ret;
+    reader->BeginArray();
+    while (reader->NextArrayItem()) {
+      std::string key;
+      reader->BeginObject();
+      string inputName;
+      int dep_mod_indx;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "mod_indx") {
+          reader->Read(&dep_mod_indx);
+        }
+        if (key == "input_name") {
+          reader->Read(&inputName);
+        }
+      }
+      ret[dep_mod_indx] = inputName;
+    }
+    return ret;
+  }
+
+  unordered_map<int, unordered_map<int, string>> LoadOutput(dmlc::JSONReader* reader) {
+    reader->BeginArray();
+    unordered_map<int, unordered_map<int, string>> ret;
+    while (reader->NextArrayItem()) {
+      std::string key;
+      reader->BeginObject();
+      string inputName;
+      int output_indx;
+      unordered_map<int, string> dep;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "output_indx") {
+          reader->Read(&output_indx);
+        }
+
+        if
(key == "dependent") { + dep = LoadDependent(reader); + } + } + ret[output_indx] = dep; + } + return ret; + } + void Load(dmlc::JSONReader* reader) { reader->BeginArray(); while (reader->NextArrayItem()) { @@ -133,7 +185,6 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { if (key == "mod_indx") { reader->Read(&mod_indx); } - if (key == "lib_name") { reader->Read(&libName); } @@ -151,57 +202,14 @@ class TVM_DLL SubGraphRuntime : public ModuleNode { } if (key == "output") { - reader->BeginArray(); - while (reader->NextArrayItem()) { - int output_indx = -1; - unordered_map depend; - reader->BeginObject(); - while (reader->NextObjectItem(&key)) { - if (key == "output_indx") { - reader->Read(&output_indx); - } - if (key == "dependent") { - reader->BeginArray(); - int dep_mod_indx = -1; - string inputName; - while (reader->NextArrayItem()) { - reader->BeginObject(); - while (reader->NextObjectItem(&key)) { - if (key == "mod_indx") { - reader->Read(&dep_mod_indx); - } - if (key == "input_name") { - reader->Read(&inputName); - } - } - if (dep_mod_indx >= 0) { - depend[dep_mod_indx] = inputName; - } - } - } - } - - if (output_indx >= 0) { - output[output_indx] = depend; - } - } + output = LoadOutput(reader); } } - if (mod_indx >= 0) { - pipeline_conf_[mod_indx] = output; - mod_conf_[mod_indx] = { - {"lib_name", libName}, {"json_name", jsonName}, {"params", paramsName}, {"dev", dev}}; - } + pipeline_conf_[mod_indx] = output; + mod_conf_[mod_indx] = { + {"lib_name", libName}, {"json_name", jsonName}, {"params", paramsName}, {"dev", dev}}; } } - - protected: - vector output_entry_; - PIPELINE_CONF pipeline_conf_; - MOD_CONF mod_conf_; - vector> runtimes_; - MOD_DLDATA_MAP_PTR input_int_map_; - size_t outpuNumber_ = 0; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index 98355bee1d4e..3033e5975baa 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -79,7 +79,9 @@ vector pipeline_graph_runtime(Array modules, const MOD_CONF& mod for (auto mod : modules) { ret.push_back(mod); } - } else {// if modules is empty, need to build the graph runtime + + // if modules is empty, need to build the graph runtime from mod_conf + } else { ret.resize(mod_conf.size()); for (auto mconf : mod_conf) { // load lib @@ -96,15 +98,15 @@ vector pipeline_graph_runtime(Array modules, const MOD_CONF& mod istringstream istr(mconf.second["dev"]); string str; int deviceType = 1, deviceId = 0; - while(getline(istr, str, ';')) { - istringstream istrDev(str); - string stemp; - if (getline(istrDev, stemp)) { - deviceType = stoi(stemp); - } - if (getline(istrDev, stemp)) { - deviceId = stoi(stemp); - } + while (getline(istr, str, ';')) { + istringstream istrDev(str); + string stemp; + if (getline(istrDev, stemp)) { + deviceType = stoi(stemp); + } + if (getline(istrDev, stemp)) { + deviceId = stoi(stemp); + } } Module graphModule = (*graphRuntimeCreate)(json, lib, deviceType, deviceId); diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index f08bcfd23382..aebad9fb81a0 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -177,8 +177,9 @@ def run_pipeline(target): #build and create pipeline module """ with relay.build_config(opt_level=3): - pipeline_mods, string_config = pipeline_executor.build_pipeline(mod_config, - "/scratch/hj/data/") + pipeline_mods, string_config = 
pipeline_executor.build_pipeline( + mod_config, "/scratch/hj/data/" + ) pipeline_module = pipeline_executor.create(pipeline_mods, string_config) From ad09daa9460b96babc5d218eec1b085c9f75a695 Mon Sep 17 00:00:00 2001 From: huajsj Date: Fri, 10 Sep 2021 19:30:01 -0700 Subject: [PATCH 22/28] change output start from 0 --- CMakeLists.txt | 8 ++++---- src/runtime/pipeline/pipeline_struct.h | 8 ++++---- tests/python/relay/test_pipeline_executor.py | 13 +++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad3fbc06791d..83b865f3c57b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -389,11 +389,11 @@ if(GTEST_INCLUDE_DIR AND GTEST_LIB) include(GoogleTest) endif() -if(USE_SUBGRAPH_EXECUTOR) +if(USE_PIPELINE_EXECUTOR) message(STATUS "Build with Subgraph Executor support...") - file(GLOB RUNTIME_SUBGRAPH_SRCS src/runtime/subgraph/*.cc) - list(APPEND RUNTIME_SRCS ${RUNTIME_SUBGRAPH_SRCS}) -endif(USE_SUBGRAPH_EXECUTOR) + file(GLOB RUNTIME_PIPELINE_SRCS src/runtime/pipeline/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_PIPELINE_SRCS}) +endif(USE_PIPELINE_EXECUTOR) # Module rules include(cmake/modules/VTA.cmake) diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 12305530c0da..6fe6b7410128 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -22,7 +22,7 @@ #include #include #include -#include +//#include #include #include #include @@ -100,7 +100,7 @@ class Dependent { assert(inputIndx <= TYP_MAX(DEP_INDX_TYPE)); if (modIndx == 0) { bFinal = true; - outputIndx = inputIndx; + outputIndx = inputIndx - 1; } else { dependent[modIndx - 1] = inputIndx; } @@ -307,7 +307,7 @@ class pipelineOutputData { */ for (size_t i = 0; i < slot.data.num; i++) { auto dlTensor = slot.data.inputList[i]->dlData.data; - int outputIndx = slot.data.inputList[i]->dependent.GetOutputIndx() - 1; + int outputIndx = slot.data.inputList[i]->dependent.GetOutputIndx(); assert(outputIndx < slot.data.num); dataMap[outputIndx] = dlTensor; } @@ -562,7 +562,7 @@ class RuntimeItem { size_t outputsNum = runtimePtr->NumOutputs(); for (size_t i = 0; i < outputsNum; i++) { shared_ptr output = - make_shared(runtimePtr->GetOutput(i), i + 1, runtime_pipeline_output_conf); + make_shared(runtimePtr->GetOutput(i), i , runtime_pipeline_output_conf); outputs->push_back(output); } diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index aebad9fb81a0..b8e85fc500bb 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -50,7 +50,7 @@ def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): # parse mod_config and set current output as next mod input data mconfig = mod_configs[mod] for output in mconfig["pipeline"]["output"]: - output_data = m.get_output(output["output_indx"] - 1).asnumpy() + output_data = m.get_output(output["output_indx"]).asnumpy() for dep in output["dependent"]: # currnet output use as dependent input, # input_name indicate the input index number. 
@@ -136,9 +136,9 @@ def run_pipeline(target):
     mconfig1["pipeline"] = {
         "mod_indx": 1,
         "output": [
-            {"output_indx": 1, "dependent": [{"mod_indx": 2, "input_name": "data_0"}]},
-            {"output_indx": 2, "dependent": [{"mod_indx": 3, "input_name": "data_0"}]},
-            {"output_indx": 3, "dependent": [{"mod_indx": 0, "input_name": "1"}]},
+            {"output_indx": 0, "dependent": [{"mod_indx": 2, "input_name": "data_0"}]},
+            {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_name": "data_0"}]},
+            {"output_indx": 2, "dependent": [{"mod_indx": 0, "input_name": "1"}]},
         ],
     }
     mod_config[mods[0]] = mconfig1
@@ -149,7 +149,7 @@ def run_pipeline(target):
     mconfig2["pipeline"] = {
         "mod_indx": 2,
         "output": [
-            {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_name": "data_1"}]},
+            {"output_indx": 0, "dependent": [{"mod_indx": 3, "input_name": "data_1"}]},
         ],
     }
     mod_config[mods[1]] = mconfig2
@@ -160,7 +160,7 @@ def run_pipeline(target):
 
     mconfig3["pipeline"] = {
         "mod_indx": 3,
-        "output": [{"output_indx": 1, "dependent": [{"mod_indx": 0, "input_name": "2"}]}],
+        "output": [{"output_indx": 0, "dependent": [{"mod_indx": 0, "input_name": "2"}]}],
     }
     mod_config[mods[2]] = mconfig3
 
@@ -211,6 +211,7 @@ def run_pipeline(target):
     for ref_out, out in zip(outs, pipeline_outputs):
         for ref in ref_out:
             tvm.testing.assert_allclose(ref_out[ref], out[int(ref) - 1])
+            print(ref_out[ref])
 
 
 def test_pipeline():

From 47c5cc24dc01248b0c1b7ea76cb3ff2806445888 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Fri, 1 Oct 2021 18:34:45 -0700
Subject: [PATCH 23/28] Add network split logic into test.

---
 tests/python/relay/test_pipeline_executor.py | 199 +++++++++++++++----
 1 file changed, 158 insertions(+), 41 deletions(-)

diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
index b8e85fc500bb..360ee35132ab 100644
--- a/tests/python/relay/test_pipeline_executor.py
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -22,6 +22,134 @@
 from tvm.relay import transform
 from tvm.contrib import graph_executor, pipeline_executor
 
+"""
+Split a graph into a series of subgraphs.
+""" +def pipeline_graph(expr, indices): + """Split Graph Into A Group Of Subgraph + Parameters + ---------- + expr : tvm.relay.Expr + indices : Array[int] + Returns + ------- + ret : Array[tvm.relay.IRModule] + """ + + def run_opt_pass(expr, opt_pass): + """Exectue a relay pass""" + assert isinstance(opt_pass, tvm.transform.Pass) + mod = tvm.IRModule.from_expr(expr) + mod = tvm.relay.transform.InferType()(mod) + mod = opt_pass(mod) + entry = mod["main"] + return entry if isinstance(expr, tvm.relay.Function) else entry.body + + def _operator_idx_inc(expr, operator_current_idx): + """Increase operator index""" + if not isinstance(expr, tvm.relay.expr.Constant): + operator_current_idx = operator_current_idx + 1 + + return operator_current_idx + + def merge_constant_expr(constant_expr, expr): + # merge constant express with a express + # Parameters + # ---------- + # constant_expr: + # constant expression + # expr: + # expression to merge with constant expression + + # If body not let, then reached end of the express + if not isinstance(constant_expr.body, tvm.relay.expr.Let): + return tvm.relay.expr.Let(constant_expr.var, constant_expr.value, expr) + + return tvm.relay.expr.Let( + constant_expr.var, constant_expr.value, merge_constant_expr(constant_expr.body, expr) + ) + + def _recursion(anf, operator_indx, pipeline_mods, indices, constant_expr): + # Enumrate all operator of compute graph then split the compute graph + # into a group subgraph. + # Parameters + # ---------- + # anf: + # ANF format expression + # operator_indx: + # current operator indice + # pipeline_mods: + # the subgraph list get storage in this variable + # indices: + # Array of indices use to define the subgraph scope + # constant_expr: + # constant defined before current operator + + # Do the split work + if isinstance(anf, tvm.relay.Function): + return tvm.relay.Function( + anf.params, + _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr), + anf.ret_type, + anf.type_params, + anf.attrs, + ) + if isinstance(anf, tvm.relay.expr.Let): + value = anf.value + operator_indx = _operator_idx_inc(value, operator_indx) + + # record constan expr to make sure all sugraph can find correct + # constant. + if isinstance(value, tvm.relay.expr.Constant): + if not constant_expr: + constant_expr = tvm.relay.expr.Let(anf.var, value, anf.var) + else: + constant_expr = tvm.relay.expr.Let(anf.var, value, constant_expr) + + if isinstance(value, tvm.relay.expr.Call): + if isinstance(value.op, tvm.ir.Op): + + # if have expr a(b(c(d(e)))) and indexes are [1,2,3] + # then would get separate modules for a(b),c,d(e). + # the split area is a(b)[0,1] c[2,2] d(e)[2,3] + if indices and operator_indx == indices[0]: + indices.pop(0) + ann = _recursion( + anf.body, operator_indx, pipeline_mods, indices, constant_expr + ) + + # when current subgraph use previous subgraph constant, + # such constant may become free varaible due to the constant + # not exist, merge the previous constant with current subgraph + # to avoid such issue. 
+                        if constant_expr:
+                            ann = merge_constant_expr(constant_expr, ann)
+
+                        ann = run_opt_pass(ann, transform.ToGraphNormalForm())
+                        mod = tvm.IRModule.from_expr(ann)
+                        pipeline_mods.insert(0, mod)
+                        return tvm.relay.expr.Let(anf.var, value, anf.var)
+            return tvm.relay.expr.Let(
+                anf.var,
+                value,
+                _recursion(anf.body, operator_indx, pipeline_mods, indices, constant_expr),
+            )
+        else:
+            return anf
+
+    pipeline_mods = []
+
+    # the operator count starts from 0, so the initial value is set to -1
+    operator_indx = -1
+    constant_expr = None
+    subgraph_indices = indices.copy()
+    anf = run_opt_pass(expr, transform.ToANormalForm())
+    anf = run_opt_pass(anf, transform.InferType())
+    ann = _recursion(anf, operator_indx, pipeline_mods, subgraph_indices, constant_expr)
+    ann = run_opt_pass(ann.body, transform.ToGraphNormalForm())
+    mod = tvm.IRModule.from_expr(ann)
+    pipeline_mods.insert(0, mod)
+    return pipeline_mods
 
 def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData):
     mod_input = {}
@@ -69,54 +197,45 @@ def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData):
 
     return final_output
 
-
-def get_mannual_mod():
-    mods = []
+def get_network():
     dshape = (3, 3)
-    data = relay.var("data_0", relay.TensorType(dshape, "float32"))
+    data = relay.var("data", relay.TensorType(dshape, "float32"))
     data21 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net1_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
-    data_net1_output_2 = relay.var("data_1", relay.TensorType(dshape, "float32"))
-    data_net2_output_1 = relay.var("data_0", relay.TensorType(dshape, "float32"))
     mvalue1 = np.full((1), 1).astype("float32")
     mvalue2 = np.full((1), 2).astype("float32")
    mvalue3 = np.full((1), 3).astype("float32")
    mv1 = relay.Constant(tvm.nd.array(mvalue1))
    mv2 = relay.Constant(tvm.nd.array(mvalue2))
    mv3 = relay.Constant(tvm.nd.array(mvalue3))
-
-    # net1 have three output, output3 is final output
-    net_output1 = relay.add(data, mv1)
-    net_output2 = relay.subtract(data, mv2)
-    net_output3 = relay.multiply(data, mv3)
-
-    # net2 use net1 output1 as input
-    net2 = relay.add(data_net1_output_1, mv2)
-    net2 = relay.add(net2, data21)
-    net2 = relay.add(net2, mv3)
-
-    # net3 use net2 output1 and net1 outpu2 as input
-    net3 = relay.multiply(data_net2_output_1, mv3)
-    net3 = relay.add(net3, data_net1_output_2)
-
-    mods.append(
-        tvm.IRModule.from_expr(
-            relay.Function([data], relay.Tuple([net_output1, net_output2, net_output3]))
-        )
-    )
-    mods.append(tvm.IRModule.from_expr(relay.Function([data_net1_output_1, data21], net2)))
-    mods.append(
-        tvm.IRModule.from_expr(relay.Function([data_net1_output_2, data_net2_output_1], net3))
-    )
-
+    data = relay.var("data", relay.TensorType(dshape, "float32"))
+    net = relay.add(data, mv1)
+    net = relay.multiply(net, mv3)
+
+    net = relay.add(net, mv2)
+    net = relay.add(net, data21)
+    net = relay.add(net, mv3)
+
+    net = relay.multiply(net, mv3)
+    net_output2 = relay.subtract(net, mv2)
+    net = relay.add(net, net)
+    func = relay.Function([data, data21], net)
+    mod = tvm.IRModule.from_expr(func)
+    return mod, dshape
+
+def get_split_mod():
+    mod, dshape = get_network()
+    """
+    #split the compute graph into 3 subgraphs
+    """
+    pl = [2, 5]
+    mods = pipeline_graph(mod["main"], pl)
     return mods, dshape
 
-
 def run_pipeline(target):
     """
    #Get 3 pipeline modules.
""" - mods, dshape = get_mannual_mod() + mods, dshape = get_split_mod() """ #Prepare batch data for pipeline feeding """ @@ -136,9 +255,7 @@ def run_pipeline(target): mconfig1["pipeline"] = { "mod_indx": 1, "output": [ - {"output_indx": 0, "dependent": [{"mod_indx": 2, "input_name": "data_0"}]}, - {"output_indx": 1, "dependent": [{"mod_indx": 3, "input_name": "data_0"}]}, - {"output_indx": 2, "dependent": [{"mod_indx": 0, "input_name": "1"}]}, + {"output_indx": 0, "dependent": [{"mod_indx": 2, "input_name": "x"}]}, ], } mod_config[mods[0]] = mconfig1 @@ -149,7 +266,7 @@ def run_pipeline(target): mconfig2["pipeline"] = { "mod_indx": 2, "output": [ - {"output_indx": 0, "dependent": [{"mod_indx": 3, "input_name": "data_1"}]}, + {"output_indx": 0, "dependent": [{"mod_indx": 3, "input_name": "x"}]}, ], } mod_config[mods[1]] = mconfig2 @@ -160,7 +277,7 @@ def run_pipeline(target): mconfig3["pipeline"] = { "mod_indx": 3, - "output": [{"output_indx": 0, "dependent": [{"mod_indx": 0, "input_name": "2"}]}], + "output": [{"output_indx": 0, "dependent": [{"mod_indx": 0, "input_name": "1"}]}], } mod_config[mods[2]] = mconfig3 @@ -168,7 +285,7 @@ def run_pipeline(target): #Run with graph executor for verification purpose """ outs = [ - run_modules(mod_config, tvm.cpu(), "llvm", "data_0", data, mods[1], "data_1", data) + run_modules(mod_config, tvm.cpu(), "llvm", "data", data, mods[1], "data_1", data) for data in datas ] """ @@ -188,7 +305,7 @@ def run_pipeline(target): """ d3 = np.full(dshape, 10).astype("float32") for data in datas: - pipeline_module.set_input("data_0", data) + pipeline_module.set_input("data", data) pipeline_module.set_input("data_1", data, mod_idx=2) pipeline_module.run() From b42de5ac868d969eb8cca744ebdf534be7f564f7 Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 7 Oct 2021 10:27:50 -0700 Subject: [PATCH 24/28] change moudle index into start 0. --- src/runtime/pipeline/pipeline_function.cc | 15 ++++++++------- src/runtime/pipeline/pipeline_struct.h | 4 ++-- tests/python/relay/test_pipeline_executor.py | 14 +++++++------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index 3033e5975baa..9912b4a8734d 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -60,8 +60,9 @@ pipeline_name_to_indx(const Array& graphRuntimes, for (auto outConf : pConfStr) { for (auto conf : outConf.second) { int modIndx = conf.first; - if (modIndx) { - auto mGetIndex = ((Module)graphRuntimes[modIndx - 1]).GetFunction("get_input_index"); + // -1 is global module/pipelineexecutor + if (modIndx >= 0) { + auto mGetIndex = ((Module)graphRuntimes[modIndx]).GetFunction("get_input_index"); confRet[outConf.first][modIndx] = (static_cast(mGetIndex(conf.second))) + 1; } else { confRet[outConf.first][modIndx] = stoi(conf.second); @@ -123,7 +124,7 @@ vector pipeline_graph_runtime(Array modules, const MOD_CONF& mod load_params(params_arr); // put into return vector - ret[mconf.first - 1] = graphModule; + ret[mconf.first] = graphModule; } } return ret; @@ -138,7 +139,7 @@ size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, QUEUE* sub_queue = createQueue(NULL, SUB_Q_SIZE); /* runtimeIndx start from 1. */ - int runtimeIndx = i + 1; + int runtimeIndx = i; /* get dependency configuration information. 
*/ auto pConf = pipeline_name_to_indx(graphRuntimes, pipeline_conf.at(runtimeIndx)); @@ -148,8 +149,8 @@ size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, /* set prev and next for RuntimeItem, runtime need these information to * poll data from prev and do notification for next. */ - if (i > 0) { - (*runtimes)[i - 1]->next = (*runtimes)[i]; + if (i < len - 1) { + (*runtimes)[i]->next = (*runtimes)[i + 1]; } if (i == len - 1) { (*runtimes)[i]->next = (*runtimes)[0]; @@ -160,7 +161,7 @@ size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, for (auto depMap : pConf) { /* output is final output when dependent number is 0. */ - outputNum += depMap.second.find(0) != depMap.second.end(); + outputNum += depMap.second.find(-1) != depMap.second.end(); } } else { outputNum += runItem->runtimePtr->NumOutputs(); diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 6fe6b7410128..d4f66f3b5325 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -102,7 +102,7 @@ class Dependent { bFinal = true; outputIndx = inputIndx - 1; } else { - dependent[modIndx - 1] = inputIndx; + dependent[modIndx] = inputIndx; } depNum++; } @@ -112,7 +112,7 @@ class Dependent { int GetDepModInputIndx(const int modIndx) { return dependent[modIndx - 1]; } void RemoveDependentRef(const int modIndx) { - dependent[modIndx - 1] = 0; + dependent[modIndx] = 0; depNum--; } diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 360ee35132ab..e5f8a8abc0be 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -154,7 +154,7 @@ def _recursion(anf, operator_indx, pipeline_mods, indices, constant_expr): def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): mod_input = {} final_output = {} - indx = 1 + indx = 0 for mod in mod_configs: with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target) @@ -253,9 +253,9 @@ def run_pipeline(target): # third output is final output, second output for mod3, first for mod2 # input mconfig1["pipeline"] = { - "mod_indx": 1, + "mod_indx": 0, "output": [ - {"output_indx": 0, "dependent": [{"mod_indx": 2, "input_name": "x"}]}, + {"output_indx": 0, "dependent": [{"mod_indx": 1, "input_name": "x"}]}, ], } mod_config[mods[0]] = mconfig1 @@ -264,9 +264,9 @@ def run_pipeline(target): mconfig2["target"] = "llvm" mconfig2["dev"] = tvm.cpu(0) mconfig2["pipeline"] = { - "mod_indx": 2, + "mod_indx": 1, "output": [ - {"output_indx": 0, "dependent": [{"mod_indx": 3, "input_name": "x"}]}, + {"output_indx": 0, "dependent": [{"mod_indx": 2, "input_name": "x"}]}, ], } mod_config[mods[1]] = mconfig2 @@ -276,8 +276,8 @@ def run_pipeline(target): mconfig3["dev"] = tvm.cpu(0) mconfig3["pipeline"] = { - "mod_indx": 3, - "output": [{"output_indx": 0, "dependent": [{"mod_indx": 0, "input_name": "1"}]}], + "mod_indx": 2, + "output": [{"output_indx": 0, "dependent": [{"mod_indx": -1, "input_name": "1"}]}], } mod_config[mods[2]] = mconfig3 From 25faf1973ce350aad6775361980e33a4745044b9 Mon Sep 17 00:00:00 2001 From: huajsj Date: Sat, 16 Oct 2021 00:30:22 -0700 Subject: [PATCH 25/28] fix mod_idx = 0 crash issue, but still logic wrong --- python/tvm/contrib/pipeline_executor.py | 8 ++++---- src/runtime/pipeline/pipeline_executor.cc | 4 ++-- src/runtime/pipeline/pipeline_function.cc | 6 +++--- src/runtime/pipeline/pipeline_struct.h | 10 +++++----- 
tests/python/relay/test_pipeline_executor.py | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 04287bd7837d..bd8c9a7685f1 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -92,7 +92,7 @@ def build_pipeline(mod_n_configs, export_path=None): mconf = mod_config["pipeline"].copy() # Get mod device config dev = mod_config["dev"] - mod_indx = mconf["mod_indx"] - 1 + mod_indx = mconf["mod_indx"] assert mod_indx < config_len build_func = relay.build # if there is a self defined build function then use it. @@ -197,7 +197,7 @@ def __init__(self, modules, pipeline_config): self._get_num_outputs = module["get_num_outputs"] self._get_num_inputs = module["get_num_inputs"] - def set_input(self, key, value, mod_idx=1, params=None): + def set_input(self, key, value, mod_idx=0, params=None): """Set inputs to the module via kwargs Parameters @@ -214,12 +214,12 @@ def set_input(self, key, value, mod_idx=1, params=None): params : dict of str to NDArray Additional arguments """ - assert mod_idx >= 1 + assert mod_idx >= 0 self._set_input(key, tvm.nd.array(value, tvm.cpu()), mod_idx) if params: for param in params: - self.graph_modules_[mod_idx - 1].set_input(**param) + self.graph_modules_[mod_idx].set_input(**param) def run(self): """Run forward execution of the graph""" diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 422bae4b61cd..2c949bea058f 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -48,7 +48,7 @@ void SubGraphRuntime::Init(const Array& modules, * \param modIndx The runtime index. */ void SubGraphRuntime::SetInput(int index, DLTensor* data_in, int mod_idx) { - if (1 == mod_idx) { + if (0 == mod_idx) { runtimes_[0]->runtimePtr->SetInput(index, data_in); } else { pipeline_setinput(input_int_map_, index, data_in, mod_idx); @@ -93,7 +93,7 @@ NDArray SubGraphRuntime::GetInput(int index, int mIndx) const { * \return int corresponding to given input node name. */ int SubGraphRuntime::GetInputIndex(const string& name, int mIndx) const { - auto gruntime = runtimes_[mIndx - 1]; + auto gruntime = runtimes_[mIndx]; return gruntime->runtimePtr->GetInputIndex(name); } diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc index 9912b4a8734d..90475a3cc9f0 100644 --- a/src/runtime/pipeline/pipeline_function.cc +++ b/src/runtime/pipeline/pipeline_function.cc @@ -149,8 +149,8 @@ size_t pipeline_init(Array modules, SHARED_RUNTIME_VEC* runtimes, /* set prev and next for RuntimeItem, runtime need these information to * poll data from prev and do notification for next. 
*/ - if (i < len - 1) { - (*runtimes)[i]->next = (*runtimes)[i + 1]; + if (i > 0) { + (*runtimes)[i - 1]->next = (*runtimes)[i]; } if (i == len - 1) { (*runtimes)[i]->next = (*runtimes)[0]; @@ -193,7 +193,7 @@ void pipeline_run(const SHARED_RUNTIME_VEC& runtimes, const MOD_DLDATA_MAP_PTR i for (auto modInputs : *indxInputs) { int modIndx = modInputs.first; for (auto inputs : modInputs.second) { - outputs.push_back(make_shared(modIndx, inputs.first + 1, inputs.second->data)); + outputs.push_back(make_shared(modIndx, inputs.first, inputs.second->data)); } } diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index d4f66f3b5325..b1f80e13a4fc 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -98,9 +98,9 @@ class Dependent { void SetDepModInputIndx(const int modIndx, const uint8_t inputIndx) { assert(modIndx <= DEPENDENT_MAX); assert(inputIndx <= TYP_MAX(DEP_INDX_TYPE)); - if (modIndx == 0) { + if (modIndx == -1) { bFinal = true; - outputIndx = inputIndx - 1; + outputIndx = inputIndx; } else { dependent[modIndx] = inputIndx; } @@ -109,7 +109,7 @@ class Dependent { int GetOutputIndx(void) { return outputIndx; } - int GetDepModInputIndx(const int modIndx) { return dependent[modIndx - 1]; } + int GetDepModInputIndx(const int modIndx) { return dependent[modIndx]; } void RemoveDependentRef(const int modIndx) { dependent[modIndx] = 0; @@ -471,8 +471,8 @@ class RuntimeData { * cross device memory copy to set input data. */ int inputIndx = data[i]->dependent.GetDepModInputIndx(runtimeIndx); - if (inputIndx > 0) { - runtimePtr->SetInput(inputIndx - 1, data[i]->dlData.data); + if (inputIndx = 0) { + runtimePtr->SetInput(inputIndx, data[i]->dlData.data); /* data getused remove dependent reference for current runtime */ data[i]->dependent.RemoveDependentRef(runtimeIndx); diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index e5f8a8abc0be..97c3e081a094 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -184,7 +184,7 @@ def run_modules(mod_configs, dev, target, dname, data, iMod, iName, iData): # input_name indicate the input index number. mod_indx = dep["mod_indx"] input_name = dep["input_name"] - if mod_indx == 0: + if mod_indx == -1: final_output[input_name] = output_data else: if mod_indx in mod_input: @@ -217,7 +217,7 @@ def get_network(): net = relay.multiply(net, mv3) net_output2 = relay.subtract(net, mv2) - net = relay.add(net, net) + net = relay.add(net, mv3) func = relay.Function([data, data21], net) mod = tvm.IRModule.from_expr(func) return mod, dshape @@ -306,7 +306,7 @@ def run_pipeline(target): d3 = np.full(dshape, 10).astype("float32") for data in datas: pipeline_module.set_input("data", data) - pipeline_module.set_input("data_1", data, mod_idx=2) + pipeline_module.set_input("data_1", data, mod_idx=1) pipeline_module.run() """ From 2e0c1c3f641fad49718f411eeb964547f6c0b864 Mon Sep 17 00:00:00 2001 From: huajsj Date: Sun, 17 Oct 2021 14:21:40 -0700 Subject: [PATCH 26/28] fix crash issue. 
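
The crash traces back to Dependent's bookkeeping in pipeline_struct.h:
RemoveDependentRef cleared an entry by writing 0, which is a valid module
index now that module indices are 0-based, and the consumer check used an
assignment (inputIndx = 0) where a comparison was intended. The diff below
switches to -1 as the "no dependency" sentinel and tests with '>= 0'. For
illustration only, a minimal Python model of the intended bookkeeping
(DependentModel and its method names are hypothetical, not part of the
patch):

    # Models the corrected Dependent logic: -1 is the "no dependency"
    # sentinel, because 0 is a valid module index once indices are 0-based.
    NO_DEP = -1

    class DependentModel:
        def __init__(self):
            self.dependent = {}

        def set_dep(self, mod_idx, input_idx):
            # module mod_idx consumes this tensor as its input input_idx
            self.dependent[mod_idx] = input_idx

        def remove_dep(self, mod_idx):
            self.dependent[mod_idx] = NO_DEP  # was 0 before this fix

        def input_for(self, mod_idx):
            idx = self.dependent.get(mod_idx, NO_DEP)
            return idx if idx >= 0 else None  # '>=', not '='

    d = DependentModel()
    d.set_dep(1, 0)
    assert d.input_for(1) == 0
    d.remove_dep(1)
    assert d.input_for(1) is None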
---
 src/runtime/pipeline/pipeline_struct.h       | 4 ++--
 tests/python/relay/test_pipeline_executor.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h
index b1f80e13a4fc..af541faa64b1 100644
--- a/src/runtime/pipeline/pipeline_struct.h
+++ b/src/runtime/pipeline/pipeline_struct.h
@@ -112,7 +112,7 @@ class Dependent {
   int GetDepModInputIndx(const int modIndx) { return dependent[modIndx]; }
 
   void RemoveDependentRef(const int modIndx) {
-    dependent[modIndx] = 0;
+    dependent[modIndx] = -1;
     depNum--;
   }
 
@@ -471,7 +471,7 @@ class RuntimeData {
      * cross device memory copy to set input data.
      */
     int inputIndx = data[i]->dependent.GetDepModInputIndx(runtimeIndx);
-    if (inputIndx = 0) {
+    if (inputIndx >= 0) {
       runtimePtr->SetInput(inputIndx, data[i]->dlData.data);
       /* data getused remove dependent reference for current runtime
        */
       data[i]->dependent.RemoveDependentRef(runtimeIndx);

diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py
index 97c3e081a094..c7dd211eea97 100644
--- a/tests/python/relay/test_pipeline_executor.py
+++ b/tests/python/relay/test_pipeline_executor.py
@@ -277,7 +277,7 @@ def run_pipeline(target):
 
     mconfig3["pipeline"] = {
         "mod_indx": 2,
-        "output": [{"output_indx": 0, "dependent": [{"mod_indx": -1, "input_name": "1"}]}],
+        "output": [{"output_indx": 0, "dependent": [{"mod_indx": -1, "input_name": "0"}]}],
     }
     mod_config[mods[2]] = mconfig3

From fba531e55f032367acc28090605c5533b2e88f55 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Sun, 17 Oct 2021 22:57:19 -0700
Subject: [PATCH 27/28] fix input data not getting set correctly.

[Finding] The final output was identical to the constant value, which
suggested the input data was all zeros. The cause: the input-index lookup
applied a '+1' even though input indices already start from 0. That means
set_input('x', ...) should translate to set_input(0, ...), but the wrong
logic produced set_input(1, ...).

---
 src/runtime/pipeline/pipeline_function.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/pipeline/pipeline_function.cc b/src/runtime/pipeline/pipeline_function.cc
index 90475a3cc9f0..f0a845c90434 100644
--- a/src/runtime/pipeline/pipeline_function.cc
+++ b/src/runtime/pipeline/pipeline_function.cc
@@ -63,7 +63,7 @@ pipeline_name_to_indx(const Array<Module>& graphRuntimes,
       // -1 is global module/pipelineexecutor
       if (modIndx >= 0) {
         auto mGetIndex = ((Module)graphRuntimes[modIndx]).GetFunction("get_input_index");
-        confRet[outConf.first][modIndx] = (static_cast<int>(mGetIndex(conf.second))) + 1;
+        confRet[outConf.first][modIndx] = (static_cast<int>(mGetIndex(conf.second)));
       } else {
         confRet[outConf.first][modIndx] = stoi(conf.second);
       }

From 1ee5831b61b5a196eb99a44a998087f3c9c8c4a1 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Tue, 2 Nov 2021 12:01:13 -0700
Subject: [PATCH 28/28] fix set param report error issue.
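
set_input(..., params=...) forwards params to
self.graph_modules_[mod_idx].set_input(**param), which is a method of the
graph_executor.GraphModule wrapper, but create() was storing the raw
tvm.runtime.Module handles, so the forwarded call raised an error. The fix
keeps the GraphModule wrappers on the Python side and unwraps them to
.module only when calling the C++ pipeline creator. A minimal sketch of
the resulting ownership split (create_sketch and PipelineModuleSketch are
illustrative names, not the shipped code):

    import json
    import tvm
    from tvm.contrib import graph_executor

    def create_sketch(pipeline_mods, mod_config):
        mods = []
        for pipeline_mod in pipeline_mods:
            # keep the GraphModule wrapper; it provides set_input()
            mod = graph_executor.GraphModule(
                pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"])
            )
            mods.append(mod)
        return PipelineModuleSketch(mods, json.dumps(mod_config))

    class PipelineModuleSketch:
        def __init__(self, modules, pipeline_config):
            # unwrap to raw tvm.runtime.Module only for the FFI call
            raw_mods = [m.module for m in modules]
            create = tvm._ffi.get_global_func("tvm.pipeline_executor.create")
            self.module = create(raw_mods, pipeline_config)
            # wrappers stay around so set_input(**param) keeps working
            self.graph_modules_ = modules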
--- python/tvm/contrib/pipeline_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index bd8c9a7685f1..c1f7842b3e6d 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -157,7 +157,7 @@ def create(pipeline_mods, mod_config): mod = graph_executor.GraphModule( pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"]) ) - mods.append(mod.module) + mods.append(mod) submodule = PipelineModule(mods, json.dumps(mod_config)) # submodule = PipelineModule(pipeline_mods, json.dumps(mod_config)) @@ -181,7 +181,7 @@ class PipelineModule(object): def __init__(self, modules, pipeline_config): mods = [] for module in modules: - mods.append(module) + mods.append(module.module) pipelinecreate = tvm._ffi.get_global_func("tvm.pipeline_executor.create") assert pipelinecreate
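
For reference, a minimal end-to-end sketch of the API as it stands at the
end of this series. It assumes the pipeline_graph/get_split_mod helpers
from test_pipeline_executor.py above, a TVM build with
USE_PIPELINE_EXECUTOR enabled, and a writable export path; get_output and
stop are assumed to mirror the SubGraphModule methods shown earlier:

    import numpy as np
    from tvm import relay
    from tvm.contrib import pipeline_executor

    mods, dshape = get_split_mod()        # helpers from the test above
    mod_config = make_mod_config(mods)    # assembled per module as in
                                          # run_pipeline; make_mod_config
                                          # is a hypothetical helper

    with relay.build_config(opt_level=3):
        pipeline_mods, string_config = pipeline_executor.build_pipeline(
            mod_config, "/tmp/pipeline_export"  # placeholder export path
        )
    pipeline_module = pipeline_executor.create(pipeline_mods, string_config)

    data = np.full(dshape, 3).astype("float32")
    pipeline_module.set_input("data", data)               # module 0 input
    pipeline_module.set_input("data_1", data, mod_idx=1)  # module 1 input
    pipeline_module.run()
    outputs = pipeline_module.get_output()  # poll the final outputs
    pipeline_module.stop()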