From 1dae13b0a0d9827d04610b226bfdeb1bf07d02e5 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 22 Apr 2022 12:18:02 +0300
Subject: [PATCH 01/21] high-level design for invoke_with_outputs method was
 implemented

---
 include/tvm/runtime/vm/vm.h | 10 ++++++++++
 python/tvm/runtime/vm.py    | 19 +++++++++++++++++++
 src/runtime/vm/vm.cc        |  8 ++++++++
 3 files changed, 37 insertions(+)
diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 5a72a99fa635..216361cdd1a1 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -281,6 +281,13 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   void SetOneInput(std::string name, const TVMArgValue& tag, const TVMArgValue& tensor);
 
+  /*!
+   * \brief Set pre-allocated outputs to a function.
+   * \param name The function name
+   * \param args outputs to the function.
+   */
+  void SetOutputs(std::string name, TVMArgs args);
+
   /*!
    * \brief Internal hook for profiling the start of an op.
    *
@@ -356,6 +363,9 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   ObjectPtr<Executable> exec_;
   /*! \brief The function name to inputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
+  bool set_outputs_enabled_ = false;
+  /*! \brief The function name to pre-allocated outputs mapping. */
+  std::unordered_map<std::string, std::vector<ObjectRef>> outputs_;
   /*!
    * \brief The "physical" devices the VM can execute primitives on. All "device indexes"
    * are w.r.t. this vector. Each entry in this vector must match the corresponding entry
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 615f66fdcc1c..3415f277f786 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -399,6 +399,7 @@ def __init__(self, exe, device, memory_cfg=None):
         self._get_input_index = self.module["get_input_index"]
         self._set_input = self.module["set_input"]
         self._set_one_input = self.module["set_one_input"]
+        self._set_outputs = self.module["set_outputs"]
         self._setup_device(device, memory_cfg)
 
     def _setup_device(self, dev, memory_cfg):
@@ -560,6 +561,24 @@ def invoke_stateful(self, func_name, *args, **kwargs):
             self.set_input(func_name, *args, **kwargs)
         self._invoke_stateful(func_name)
 
+    def invoke_with_outputs(self, func_name, *args):
+        """Invoke a function with pre-allocated outputs tensors.
+        It requires use set_input method before.
+
+        This invoke method allows to avoid excess copying if memory for output tensors
+        was allocated before inference.
+
+        Parameters
+        ----------
+        func_name : str
+            The name of the function.
+
+        args : list[tvm.runtime.NDArray] or list[DLTensor]
+            The output tensors of the function.
+        """
+        self._set_outputs(func_name, *args)
+        self._invoke(func_name)
+
     def get_outputs(self):
         """Get the outputs from a call to :py:func`invoke_stateful`.
 
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 6f52f4b83c81..56f24369225a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -224,6 +224,9 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
                                 << "(func_name, index or name, tensor)";
       SetOneInput(args[0], args[1], args[2]);
     });
+  } else if (name == "set_outputs") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetOutputs(args[0], args); });
   } else if (name == "load_late_bound_consts") {
     return PackedFunc([this](TVMArgs args, TVMRetValue* rv) {
       CHECK_EQ(args.size(), 1);
@@ -272,6 +275,10 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag,
   SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev);
 }
 
+void VirtualMachine::SetOutputs(std::string name, TVMArgs args) {
+  set_outputs_enabled_ = true;
+}
+
 int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,
                                                     const std::string& input_name) const {
   const auto& vm_func = CheckAndGetVMFunction(func_name);
@@ -765,6 +772,7 @@ void VirtualMachine::RunLoop() {
         auto caller_return_register = frames_.back().caller_return_register;
 
         if (PopFrame() == frame_start) {
+          set_outputs_enabled_ = false;
           return;
           // Otherwise we are just returning from a local call.
         } else {

From 3795e046f9cb0543f86e83463e696a2063f2d03d Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 22 Apr 2022 14:50:03 +0300
Subject: [PATCH 02/21] GetResultRegisterIndex was implemented

---
 include/tvm/runtime/vm/vm.h | 6 ++++++
 src/runtime/vm/vm.cc        | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 216361cdd1a1..30f4d8058bba 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -346,6 +346,12 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   void SetInputTensorWithIndex(std::vector<ObjectRef>& tensors,  // NOLINT(*)
                                const TVMArgValue& tensor, int index, Device dev);
 
+  /*!
+   * \brief Get index of outputs in register_file from frame
+   * \return index
+   */
+  Index GetResultRegisterIndex();
+
  protected:
   /*! \brief The virtual machine's packed function table. */
   std::vector<PackedFunc> packed_funcs_;
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 56f24369225a..f048c0a497cb 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -525,6 +525,15 @@ int64_t VirtualMachine::LoadScalarInt(Index r) const {
   return result;
 }
 
+Index VirtualMachine::GetResultRegisterIndex() {
+  Index op_index = 0;
+  while (code_[op_index].op != Opcode::Ret) {
+    ++op_index;
+  }
+
+  return code_[op_index].result;
+}
+
 void VirtualMachine::RunLoop() {
   ICHECK(this->exec_);
   ICHECK(this->code_);

From db4fe130b4481f0a882392f0d87b9a0197bab3fd Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 12 May 2022 19:01:38 +0300
Subject: [PATCH 03/21] SetOutputs method was implemented

---
 src/runtime/vm/vm.cc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index f048c0a497cb..7d27c9d95c28 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -277,6 +277,22 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag,
 
 void VirtualMachine::SetOutputs(std::string name, TVMArgs args) {
   set_outputs_enabled_ = true;
+  std::vector<ObjectRef> external_output_arrays;
+  for (int i = 0; i < args.size(); ++i) {
+    TVMArgValue output_tensor = args[i];
+    if (output_tensor.type_code() == kTVMDLTensorHandle) {
+      DLTensor* dl_tensor = output_tensor;
+      external_output_arrays.emplace_back(NDArray::FromExternalDLTensor(*dl_tensor));
+    } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
+      // TODO(vvchernov): emplace_back?
+      external_output_arrays.push_back(output_tensor.AsObjectRef<tvm::runtime::NDArray>());
+    } else {
+      LOG(FATAL) << "Output tensors of not DLTensor or NDArray type are not supported now!";
+    }
+  }
+  // TODO(vvchernov): I'm not sure we need any tag here. Nevertheless it is required
+  auto output_set = ADT(0, external_output_arrays);
+  WriteRegister(GetResultRegisterIndex(), output_set);
 }
 
 int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,

From c598286bb742796b36d6533afa9227e1e0731f77 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 16 May 2022 10:37:25 +0300
Subject: [PATCH 04/21] update writing to register for AllocTensor op

---
 include/tvm/runtime/vm/vm.h | 15 +++++++++++++
 src/runtime/vm/vm.cc        | 44 +++++++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 30f4d8058bba..edd31b7a8372 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -352,6 +352,21 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   Index GetResultRegisterIndex();
 
+  /*!
+   * \brief Write new allocated tensor to register_file of frame
+   * \param instr current instruction containing shape and storage info
+   */
+  void WriteAllocatedTensor(const Instruction& instr);
+
+  /*!
+   * \brief 'set_outputs_enabled' is assumed true for using this method.
+   * It is expected that result register has already contained tensor from outside,
+   * new tensor is not allocated and write, but expected shape is checked.
+   * For other register WriteAllocatedMethod is used.
+   * \param instr current instruction containing shape and storage info
+   */
+  void WriteAllocatedTensorFromOutside(const Instruction& instr);
+
  protected:
   /*! \brief The virtual machine's packed function table. */
   std::vector<PackedFunc> packed_funcs_;
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 7d27c9d95c28..469c88fa310a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -698,21 +698,11 @@ void VirtualMachine::RunLoop() {
       }
       case Opcode::AllocTensor: {
         OpStartHook(instr);
-        auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);
-
-        for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
-          shape[i] = instr.alloc_tensor.shape[i];
+        if (set_outputs_enabled_) {
+          WriteAllocatedTensorFromOutside(instr);
+        } else {
+          WriteAllocatedTensor(instr);
         }
-
-        auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
-        auto offset = LoadScalarInt(instr.alloc_tensor.offset);
-        auto storage = Downcast<Storage>(storage_obj);
-        auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
-        VLOG(2) << "allocated "
-                << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
-                                        /*show_contents=*/false);
-
-        WriteRegister(instr.dst, obj);
         OpStopHook();
         pc_++;
         goto main_loop;
@@ -858,6 +848,32 @@ void VirtualMachine::RunLoop() {
   }
 }
 
+void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
+  auto shape = std::vector<int64_t>(instr.alloc_tensor.ndim);
+
+  for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
+    shape[i] = instr.alloc_tensor.shape[i];
+  }
+
+  auto storage_obj = ReadRegister(instr.alloc_tensor.storage);
+  auto offset = LoadScalarInt(instr.alloc_tensor.offset);
+  auto storage = Downcast<Storage>(storage_obj);
+  auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype);
+  VLOG(2) << "allocated "
+          << RuntimeObject2String(obj, GetDevice(exec_->host_device_index),
+                                  /*show_contents=*/false);
+
+  WriteRegister(instr.dst, obj);
+}
+
+void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
+  if (instr.dst == GetResultRegisterIndex()) {
+    // TODO(vvchernov): check shape
+  } else {
+    WriteAllocatedTensor(instr);
+  }
+}
+
 runtime::Module CreateVirtualMachine(Executable* exec) {
   auto vm = make_object<VirtualMachine>();
   vm->LoadExecutable(GetObjectPtr<Executable>(exec));

From d9cbaf51ba4b9cedc6bbd52d91e832ee29553a0f Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Tue, 17 May 2022 19:27:11 +0300
Subject: [PATCH 05/21] update SetOutputs based on number of outputs. Take into
 account different funcs through func name

---
 include/tvm/runtime/vm/vm.h | 37 ++++++++++++---
 src/runtime/vm/vm.cc        | 89 ++++++++++++++++++++++++++++---------
 2 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index edd31b7a8372..31731c8b78b4 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -226,6 +226,17 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   ObjectRef Invoke(const std::string& name, const std::vector<ObjectRef>& args);
 
+  /*!
+   * \brief Invoke a VM function.
+   * \param func The function.
+   * \param input_args The input arguments to the function.
+   * \param output_args The pre-allocated output arguments of the function.
+   * \return The object(s) representing the result.
+   */
+  ObjectRef Invoke(const VMFunction& func,
+                   const std::vector<ObjectRef>& input_args,
+                   const std::vector<ObjectRef>& output_args);
+
   /*!
    * \brief Invoke a PackedFunction
    *
@@ -249,7 +260,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
             const std::vector<AllocatorType>& alloc_types);
 
   /*! \brief Run VM dispatch loop. */
-  void RunLoop();
+  void RunLoop(bool set_output_enabled = false);
 
   /*! \brief Get device from the device list based on a given device index. */
   Device GetDevice(Index device_index) const;
@@ -288,6 +299,12 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   void SetOutputs(std::string name, TVMArgs args);
 
+  /*!
+   * \brief Set pre-allocated outputs to register for specified function.
+   * \param outputs set of output tensors.
+   */
+  void SetOutputTensorsToRegister(const std::vector<ObjectRef>& outputs);
+
   /*!
    * \brief Internal hook for profiling the start of an op.
    *
@@ -347,10 +364,18 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
                                const TVMArgValue& tensor, int index, Device dev);
 
   /*!
-   * \brief Get index of outputs in register_file from frame
-   * \return index
+   * \brief Convert tensor from TVMArgValue to ObjectRef.
+   * DLTensor and NDArray types are supported.
+   * \param tensor given arg value containing tensor.
+   * \return tensor in ObjectRef format
+   */
+  ObjectRef TensorFromTVMArgValueToObjectRef(const TVMArgValue& tensor) const;
+
+  /*!
+   * \brief Get index of outputs in register_file from func code
+   * \return result register index
    */
-  Index GetResultRegisterIndex();
+  Index GetResultRegisterIndex() const;
 
   /*!
    * \brief Write new allocated tensor to register_file of frame
@@ -365,7 +390,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    * For other register WriteAllocatedMethod is used.
    * \param instr current instruction containing shape and storage info
    */
-  void WriteAllocatedTensorFromOutside(const Instruction& instr);
+  void WriteAllocatedTensorFromOutside(const Instruction& instr, Index res_index);
 
  protected:
   /*! \brief The virtual machine's packed function table. */
@@ -384,7 +409,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   ObjectPtr<Executable> exec_;
   /*! \brief The function name to inputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
-  bool set_outputs_enabled_ = false;
+  std::unordered_map<std::string, bool> set_outputs_enabled_;
   /*! \brief The function name to pre-allocated outputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> outputs_;
   /*!
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 469c88fa310a..228d7840bdfc 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -143,8 +143,14 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
       } else {
         auto it = inputs_.find(func_name);
         ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name;
-        const std::vector<ObjectRef>& func_args = it->second;
-        *rv = Invoke(func, func_args);
+        const std::vector<ObjectRef>& input_args = it->second;
+        if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) {
+          ICHECK(outputs_.count(func_name)) << "Outputs have not been set for function " << func_name;
+          *rv = Invoke(func, input_args, outputs_[func_name]);
+          set_outputs_enabled_[func_name] = false;
+        } else {
+          *rv = Invoke(func, input_args);
+        }
       }
     });
   } else if (name == "invoke_stateful") {
@@ -275,24 +281,47 @@ void VirtualMachine::SetOneInput(std::string func_name, const TVMArgValue& tag,
   SetInputTensorWithIndex(inputs_[func_name], tensor, inp_index, dev);
 }
 
-void VirtualMachine::SetOutputs(std::string name, TVMArgs args) {
-  set_outputs_enabled_ = true;
-  std::vector<ObjectRef> external_output_arrays;
-  for (int i = 0; i < args.size(); ++i) {
-    TVMArgValue output_tensor = args[i];
+void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
+  set_outputs_enabled_[func_name] = true;
+  size_t outputs_size = args.size();
+  // First args is func_name
+  ICHECK_GT(outputs_size, 1)
+    << "There is no output arguments set";
+
+  std::vector<ObjectRef> func_args(outputs_size - 1);
+  for (size_t i = 1; i < outputs_size; ++i) {
+    // TODO(vvchernov): device?
+    // TODO(vvchernov): correct index sequence for multiple outputs?
+    func_args[i-1] = TensorFromTVMArgValueToObjectRef(args[i]);
+  }
+  outputs_.erase(func_name);
+  outputs_.emplace(func_name, func_args);
+}
+
+void VirtualMachine::SetOutputTensorsToRegister(const std::vector<ObjectRef>& outputs) {
+  size_t size = outputs.size();
+
+  Index res_ind = GetResultRegisterIndex();
+  if (size == 1) {
+    WriteRegister(res_ind, outputs[0]);
+  } else {
+    // TODO(vvchernov): I'm not sure we need any tag here. Nevertheless it is required
+    auto output_set = ADT(0, outputs);
+    WriteRegister(res_ind, output_set);
+  }
+}
+
+ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const {
     if (output_tensor.type_code() == kTVMDLTensorHandle) {
       DLTensor* dl_tensor = output_tensor;
-      external_output_arrays.emplace_back(NDArray::FromExternalDLTensor(*dl_tensor));
+      return NDArray::FromExternalDLTensor(*dl_tensor);
     } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
-      // TODO(vvchernov): emplace_back?
-      external_output_arrays.push_back(output_tensor.AsObjectRef<tvm::runtime::NDArray>());
+      return output_tensor.AsObjectRef<tvm::runtime::NDArray>();
     } else {
-      LOG(FATAL) << "Output tensors of not DLTensor or NDArray type are not supported now!";
+      LOG(FATAL) << "It supports tensor of DLTensor or NDArray type only! Given type is "
+          << output_tensor.type_code();
     }
-  }
-  // TODO(vvchernov): I'm not sure we need any tag here. Nevertheless it is required
-  auto output_set = ADT(0, external_output_arrays);
-  WriteRegister(GetResultRegisterIndex(), output_set);
+    return ObjectRef();
 }
 
 int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,
@@ -403,6 +432,22 @@ ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<Obje
   return Invoke(exec_->functions[func_index], args);
 }
 
+ObjectRef VirtualMachine::Invoke(const VMFunction& func,
+                                 const std::vector<ObjectRef>& input_args,
+                                 const std::vector<ObjectRef>& output_args) {
+  DLOG(INFO) << "Executing Function: " << std::endl << func;
+  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
+    DLOG(INFO) << "Device " << i << " has device type " << devices_[i].device_type
+               << " and device id " << devices_[i].device_id
+               << (i == exec_->host_device_index ? " (using as host device)" : "");
+  }
+
+  InvokeGlobal(func, input_args);
+  SetOutputTensorsToRegister(output_args);
+  RunLoop(set_outputs_enabled_[func.name]);
+  return return_register_;
+}
+
 void VirtualMachine::InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count,
                                   Index output_size, const std::vector<ObjectRef>& args) {
   size_t arity = 0;
@@ -541,7 +586,7 @@ int64_t VirtualMachine::LoadScalarInt(Index r) const {
   return result;
 }
 
-Index VirtualMachine::GetResultRegisterIndex() {
+Index VirtualMachine::GetResultRegisterIndex() const {
   Index op_index = 0;
   while (code_[op_index].op != Opcode::Ret) {
     ++op_index;
@@ -550,11 +595,12 @@ Index VirtualMachine::GetResultRegisterIndex() {
   return code_[op_index].result;
 }
 
-void VirtualMachine::RunLoop() {
+void VirtualMachine::RunLoop(bool set_output_enabled) {
   ICHECK(this->exec_);
   ICHECK(this->code_);
   pc_ = 0;
   Index frame_start = frames_.size();
+  Index res_reg_index = GetResultRegisterIndex();
   while (true) {
   main_loop:
     auto const& instr = code_[this->pc_];
@@ -698,8 +744,8 @@ void VirtualMachine::RunLoop() {
       }
       case Opcode::AllocTensor: {
         OpStartHook(instr);
-        if (set_outputs_enabled_) {
-          WriteAllocatedTensorFromOutside(instr);
+        if (set_output_enabled) {
+          WriteAllocatedTensorFromOutside(instr, res_reg_index);
         } else {
           WriteAllocatedTensor(instr);
         }
@@ -787,7 +833,6 @@ void VirtualMachine::RunLoop() {
         auto caller_return_register = frames_.back().caller_return_register;
 
         if (PopFrame() == frame_start) {
-          set_outputs_enabled_ = false;
           return;
           // Otherwise we are just returning from a local call.
         } else {
@@ -866,8 +911,8 @@ void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
   WriteRegister(instr.dst, obj);
 }
 
-void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
-  if (instr.dst == GetResultRegisterIndex()) {
+void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr, Index res_index) {
+  if (instr.dst == res_index) {
     // TODO(vvchernov): check shape
   } else {
     WriteAllocatedTensor(instr);

From 82d1bb46e5582d56af31ff0bdbcafcded90326e0 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Wed, 18 May 2022 16:13:46 +0300
Subject: [PATCH 06/21] clean duplicated code in Invoke methods

---
 include/tvm/runtime/vm/vm.h |  7 +++++++
 src/runtime/vm/vm.cc        | 30 ++++++++++++++----------------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 31731c8b78b4..0b04c9ec974b 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -299,6 +299,13 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   void SetOutputs(std::string name, TVMArgs args);
 
+  /*!
+   * \brief Preparation part of Invoke method before RunLoop.
+   * \param func the function.
+   * \param args input args
+   */
+  void PrintInfoAndSetInputArgs(const VMFunction& func, const std::vector<ObjectRef>& args);
+
   /*!
    * \brief Set pre-allocated outputs to register for specified function.
    * \param outputs set of output tensors.
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 228d7840bdfc..e50c44f6665a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -298,6 +298,18 @@ void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
   outputs_.emplace(func_name, func_args);
 }
 
+
+void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func, const std::vector<ObjectRef>& args) {
+  VLOG(2) << "Executing Function: " << std::endl << func;
+  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
+    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type
+            << " and device id " << devices_[i].device_id
+            << (i == exec_->host_device_index ? " (using as host device)" : "");
+  }
+
+  InvokeGlobal(func, args);
+}
+
 void VirtualMachine::SetOutputTensorsToRegister(const std::vector<ObjectRef>& outputs) {
   size_t size = outputs.size();
 
@@ -411,14 +423,7 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector<Obje
 }
 
 ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& args) {
-  VLOG(2) << "Executing Function: " << std::endl << func;
-  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
-    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id "
-            << devices_[i].device_id
-            << (i == exec_->host_device_index ? " (using as host device)" : "");
-  }
-
-  InvokeGlobal(func, args);
+  PrintInfoAndSetInputArgs(func, args);
   RunLoop();
   return return_register_;
 }
@@ -435,14 +440,7 @@ ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<Obje
 ObjectRef VirtualMachine::Invoke(const VMFunction& func,
                                  const std::vector<ObjectRef>& input_args,
                                  const std::vector<ObjectRef>& output_args) {
-  DLOG(INFO) << "Executing Function: " << std::endl << func;
-  for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
-    DLOG(INFO) << "Device " << i << " has device type " << devices_[i].device_type
-               << " and device id " << devices_[i].device_id
-               << (i == exec_->host_device_index ? " (using as host device)" : "");
-  }
-
-  InvokeGlobal(func, input_args);
+  PrintInfoAndSetInputArgs(func, input_args);
   SetOutputTensorsToRegister(output_args);
   RunLoop(set_outputs_enabled_[func.name]);
   return return_register_;

From 830d2d33a0dbfb72537c832dc5a19e3a52064917 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Wed, 18 May 2022 16:35:57 +0300
Subject: [PATCH 07/21] support multiple outputs

---
 include/tvm/runtime/vm/vm.h | 23 +++++++++++++++++-----
 src/runtime/vm/vm.cc        | 38 +++++++++++++++++++++++++++++--------
 2 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 0b04c9ec974b..da4ff274a939 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -308,9 +308,10 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
 
   /*!
    * \brief Set pre-allocated outputs to register for specified function.
+   * \param func_name The function's name.
    * \param outputs set of output tensors.
    */
-  void SetOutputTensorsToRegister(const std::vector<ObjectRef>& outputs);
+  void SetOutputTensorsToRegister(const std::string& func_name, const std::vector<ObjectRef>& outputs);
 
   /*!
    * \brief Internal hook for profiling the start of an op.
@@ -385,8 +386,16 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   Index GetResultRegisterIndex() const;
 
   /*!
-   * \brief Write new allocated tensor to register_file of frame
-   * \param instr current instruction containing shape and storage info
+   * \brief Collect indices from register_file for output tensors.
+   * It helps to replace output tensors allocated in RunLoop by
+   * tensors pre-allocated outside. Scenario is when `set_output` is used
+   * \param func_name The function's name.
+   */
+  void CollectOutputTensorRegIndices(const std::string& func_name);
+
+  /*!
+   * \brief Write new allocated tensor to register_file of frame.
+   * \param instr current instruction containing shape and storage info.
    */
   void WriteAllocatedTensor(const Instruction& instr);
 
@@ -394,8 +403,9 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    * \brief 'set_outputs_enabled' is assumed true for using this method.
    * It is expected that result register has already contained tensor from outside,
    * new tensor is not allocated and write, but expected shape is checked.
-   * For other register WriteAllocatedMethod is used.
-   * \param instr current instruction containing shape and storage info
+   * For other register WriteAllocatedTensor method is used.
+   * \param instr current instruction containing shape and storage info.
+   * \param res_index register index of result.
    */
   void WriteAllocatedTensorFromOutside(const Instruction& instr, Index res_index);
 
@@ -416,7 +426,10 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   ObjectPtr<Executable> exec_;
   /*! \brief The function name to inputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
+  /*! \brief The function name to flag enabling scenario with set outputs. */
   std::unordered_map<std::string, bool> set_outputs_enabled_;
+  /*! \brief The function name to indices of output tensors in register file. */
+  std::unordered_map<std::string, std::vector<Index>> output_tensor_reg_indices_;
   /*! \brief The function name to pre-allocated outputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> outputs_;
   /*!
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index e50c44f6665a..167a7e24bbdb 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -310,16 +310,16 @@ void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func, const std:
   InvokeGlobal(func, args);
 }
 
-void VirtualMachine::SetOutputTensorsToRegister(const std::vector<ObjectRef>& outputs) {
+void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name, const std::vector<ObjectRef>& outputs) {
   size_t size = outputs.size();
 
   Index res_ind = GetResultRegisterIndex();
-  if (size == 1) {
-    WriteRegister(res_ind, outputs[0]);
-  } else {
-    // TODO(vvchernov): I'm not sure we need any tag here. Nevertheless it is required
-    auto output_set = ADT(0, outputs);
-    WriteRegister(res_ind, output_set);
+  CollectOutputTensorRegIndices(func_name);
+  auto& reg_indices = output_tensor_reg_indices_[func_name];
+  ICHECK_EQ(reg_indices.size(), size)
+      << "Number of outside output tensors should be equal to model outputs number";
+  for (size_t i = 0; i < size; ++i) {
+    WriteRegister(reg_indices[i], outputs[i]);
   }
 }
 
@@ -441,7 +441,7 @@ ObjectRef VirtualMachine::Invoke(const VMFunction& func,
                                  const std::vector<ObjectRef>& input_args,
                                  const std::vector<ObjectRef>& output_args) {
   PrintInfoAndSetInputArgs(func, input_args);
-  SetOutputTensorsToRegister(output_args);
+  SetOutputTensorsToRegister(func.name, output_args);
   RunLoop(set_outputs_enabled_[func.name]);
   return return_register_;
 }
@@ -593,6 +593,28 @@ Index VirtualMachine::GetResultRegisterIndex() const {
   return code_[op_index].result;
 }
 
+void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name) {
+  if (!output_tensor_reg_indices_[func_name].empty()) {
+    return;
+  }
+
+  auto& reg_indices = output_tensor_reg_indices_[func_name];
+  Index res_index = GetResultRegisterIndex();
+  Index op_index = 0;
+  while (code_[op_index].dst != res_index) {
+    ++op_index;
+  }
+  if (code_[op_index].op == Opcode::AllocTensor) {
+    reg_indices.emplace_back(res_index);
+  } else if (code_[op_index].op == Opcode::AllocADT) {
+    for (Index i = 0; i < code_[op_index].num_fields; ++i) {
+      reg_indices.push_back(code_[op_index].datatype_fields[i]);
+    }
+  } else {
+    // TODO(vvchernov): possible extension
+  }
+}
+
 void VirtualMachine::RunLoop(bool set_output_enabled) {
   ICHECK(this->exec_);
   ICHECK(this->code_);

From 31fc06961bd0566be3b17aa1b562265871d0a26d Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Wed, 18 May 2022 20:19:15 +0300
Subject: [PATCH 08/21] lint fix

---
 include/tvm/runtime/vm/vm.h |  6 +++---
 src/runtime/vm/vm.cc        | 43 ++++++++++++++++++-------------------
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index da4ff274a939..d2dc9b03324f 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -233,8 +233,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    * \param output_args The pre-allocated output arguments of the function.
    * \return The object(s) representing the result.
    */
-  ObjectRef Invoke(const VMFunction& func,
-                   const std::vector<ObjectRef>& input_args,
+  ObjectRef Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
                    const std::vector<ObjectRef>& output_args);
 
   /*!
@@ -311,7 +310,8 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    * \param func_name The function's name.
    * \param outputs set of output tensors.
    */
-  void SetOutputTensorsToRegister(const std::string& func_name, const std::vector<ObjectRef>& outputs);
+  void SetOutputTensorsToRegister(const std::string& func_name,
+                                  const std::vector<ObjectRef>& outputs);
 
   /*!
    * \brief Internal hook for profiling the start of an op.
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 167a7e24bbdb..4dcadd59c213 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -145,7 +145,8 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
         ICHECK(it != inputs_.end()) << "Input has not been set for function " << func_name;
         const std::vector<ObjectRef>& input_args = it->second;
         if (set_outputs_enabled_.count(func_name) && set_outputs_enabled_[func_name]) {
-          ICHECK(outputs_.count(func_name)) << "Outputs have not been set for function " << func_name;
+          ICHECK(outputs_.count(func_name))
+              << "Outputs have not been set for function " << func_name;
           *rv = Invoke(func, input_args, outputs_[func_name]);
           set_outputs_enabled_[func_name] = false;
         } else {
@@ -285,35 +286,34 @@ void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
   set_outputs_enabled_[func_name] = true;
   size_t outputs_size = args.size();
   // First args is func_name
-  ICHECK_GT(outputs_size, 1)
-    << "There is no output arguments set";
+  ICHECK_GT(outputs_size, 1) << "There is no output arguments set";
 
   std::vector<ObjectRef> func_args(outputs_size - 1);
   for (size_t i = 1; i < outputs_size; ++i) {
     // TODO(vvchernov): device?
     // TODO(vvchernov): correct index sequence for multiple outputs?
-    func_args[i-1] = TensorFromTVMArgValueToObjectRef(args[i]);
+    func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]);
   }
   outputs_.erase(func_name);
   outputs_.emplace(func_name, func_args);
 }
 
-
-void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func, const std::vector<ObjectRef>& args) {
+void VirtualMachine::PrintInfoAndSetInputArgs(const VMFunction& func,
+                                              const std::vector<ObjectRef>& args) {
   VLOG(2) << "Executing Function: " << std::endl << func;
   for (int i = 0; i < static_cast<int>(devices_.size()); ++i) {
-    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type
-            << " and device id " << devices_[i].device_id
+    VLOG(2) << "Device " << i << " has device type " << devices_[i].device_type << " and device id "
+            << devices_[i].device_id
             << (i == exec_->host_device_index ? " (using as host device)" : "");
   }
 
   InvokeGlobal(func, args);
 }
 
-void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name, const std::vector<ObjectRef>& outputs) {
+void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name,
+                                                const std::vector<ObjectRef>& outputs) {
   size_t size = outputs.size();
 
-  Index res_ind = GetResultRegisterIndex();
   CollectOutputTensorRegIndices(func_name);
   auto& reg_indices = output_tensor_reg_indices_[func_name];
   ICHECK_EQ(reg_indices.size(), size)
@@ -324,16 +324,16 @@ void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name, co
 }
 
 ObjectRef VirtualMachine::TensorFromTVMArgValueToObjectRef(const TVMArgValue& output_tensor) const {
-    if (output_tensor.type_code() == kTVMDLTensorHandle) {
-      DLTensor* dl_tensor = output_tensor;
-      return NDArray::FromExternalDLTensor(*dl_tensor);
-    } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
-      return output_tensor.AsObjectRef<tvm::runtime::NDArray>();
-    } else {
-      LOG(FATAL) << "It supports tensor of DLTensor or NDArray type only! Given type is "
-          << output_tensor.type_code();
-    }
-    return ObjectRef();
+  if (output_tensor.type_code() == kTVMDLTensorHandle) {
+    DLTensor* dl_tensor = output_tensor;
+    return NDArray::FromExternalDLTensor(*dl_tensor);
+  } else if (output_tensor.type_code() == kTVMNDArrayHandle) {
+    return output_tensor.AsObjectRef<tvm::runtime::NDArray>();
+  } else {
+    LOG(FATAL) << "It supports tensor of DLTensor or NDArray type only! Given type is "
+               << output_tensor.type_code();
+  }
+  return ObjectRef();
 }
 
 int64_t VirtualMachine::GetInputIndexFromVMFunction(const std::string& func_name,
@@ -437,8 +437,7 @@ ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vector<Obje
   return Invoke(exec_->functions[func_index], args);
 }
 
-ObjectRef VirtualMachine::Invoke(const VMFunction& func,
-                                 const std::vector<ObjectRef>& input_args,
+ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
                                  const std::vector<ObjectRef>& output_args) {
   PrintInfoAndSetInputArgs(func, input_args);
   SetOutputTensorsToRegister(func.name, output_args);

From 156202f54fbf73fa4b13912781fd5e0e52b2a80f Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 18 Aug 2022 11:49:17 +0300
Subject: [PATCH 09/21] update for support multi output network

---
 include/tvm/runtime/vm/vm.h |  9 ++++++---
 src/runtime/vm/vm.cc        | 27 +++++++++++++++++----------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index d2dc9b03324f..8f6ea8619b3e 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -259,7 +259,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
             const std::vector<AllocatorType>& alloc_types);
 
   /*! \brief Run VM dispatch loop. */
-  void RunLoop(bool set_output_enabled = false);
+  void RunLoop(const std::vector<Index>& output_tensor_reg_indices = {});
 
   /*! \brief Get device from the device list based on a given device index. */
   Device GetDevice(Index device_index) const;
@@ -405,9 +405,12 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    * new tensor is not allocated and write, but expected shape is checked.
    * For other register WriteAllocatedTensor method is used.
    * \param instr current instruction containing shape and storage info.
-   * \param res_index register index of result.
+   * \param output_tensor_reg_indices register indices of output tensors.
    */
-  void WriteAllocatedTensorFromOutside(const Instruction& instr, Index res_index);
+  void WriteAllocatedTensorFromOutside(const Instruction& instr,
+                                       const std::vector<Index>& output_tensor_reg_indices);
+
+  bool FindIndex(const std::vector<Index>& indices, Index val) const;
 
  protected:
   /*! \brief The virtual machine's packed function table. */
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 4dcadd59c213..c810d00942a6 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -318,8 +318,9 @@ void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name,
   auto& reg_indices = output_tensor_reg_indices_[func_name];
   ICHECK_EQ(reg_indices.size(), size)
       << "Number of outside output tensors should be equal to model outputs number";
-  for (size_t i = 0; i < size; ++i) {
-    WriteRegister(reg_indices[i], outputs[i]);
+  size_t i = 0;
+  for (auto it = reg_indices.begin(); it != reg_indices.end(); ++it, ++i) {
+    WriteRegister(*it, outputs[i]);
   }
 }
 
@@ -441,7 +442,7 @@ ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector<Objec
                                  const std::vector<ObjectRef>& output_args) {
   PrintInfoAndSetInputArgs(func, input_args);
   SetOutputTensorsToRegister(func.name, output_args);
-  RunLoop(set_outputs_enabled_[func.name]);
+  RunLoop(output_tensor_reg_indices_[func.name]);
   return return_register_;
 }
 
@@ -614,12 +615,11 @@ void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name)
   }
 }
 
-void VirtualMachine::RunLoop(bool set_output_enabled) {
+void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) {
   ICHECK(this->exec_);
   ICHECK(this->code_);
   pc_ = 0;
   Index frame_start = frames_.size();
-  Index res_reg_index = GetResultRegisterIndex();
   while (true) {
   main_loop:
     auto const& instr = code_[this->pc_];
@@ -763,10 +763,10 @@ void VirtualMachine::RunLoop(bool set_output_enabled) {
       }
       case Opcode::AllocTensor: {
         OpStartHook(instr);
-        if (set_output_enabled) {
-          WriteAllocatedTensorFromOutside(instr, res_reg_index);
-        } else {
+        if (output_tensor_reg_indices.empty()) {
           WriteAllocatedTensor(instr);
+        } else {
+          WriteAllocatedTensorFromOutside(instr, output_tensor_reg_indices);
         }
         OpStopHook();
         pc_++;
@@ -930,14 +930,21 @@ void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
   WriteRegister(instr.dst, obj);
 }
 
-void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr, Index res_index) {
-  if (instr.dst == res_index) {
+void VirtualMachine::WriteAllocatedTensorFromOutside(
+    const Instruction& instr, const std::vector<Index>& output_tensor_reg_indices) {
+  if (FindIndex(output_tensor_reg_indices, instr.dst)) {
     // TODO(vvchernov): check shape
   } else {
+    LOG(WARNING) << "Writting of allocated tensor from outside fails. Usual approach is used";
     WriteAllocatedTensor(instr);
   }
 }
 
+bool VirtualMachine::FindIndex(const std::vector<Index>& indices, Index val) const {
+  auto it = std::find(indices.begin(), indices.end(), val);
+  return it != indices.end();
+}
+
 runtime::Module CreateVirtualMachine(Executable* exec) {
   auto vm = make_object<VirtualMachine>();
   vm->LoadExecutable(GetObjectPtr<Executable>(exec));

From 605eeafaa9276931130aada7eb6013c1257c2de9 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 19 Aug 2022 08:30:04 +0300
Subject: [PATCH 10/21] extend set_output method for ReshapeTensor Op in VM

---
 include/tvm/runtime/vm/vm.h |  8 ++++
 src/runtime/vm/vm.cc        | 76 ++++++++++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 8f6ea8619b3e..b07dce900987 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -385,6 +385,12 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   Index GetResultRegisterIndex() const;
 
+  /*!
+   * \brief Find the index of the instruction whose destination is the result register.
+   * \param res_index The index of the result register.
+   */
+  void CalculatePreResultOpIndex(Index res_index);
+
   /*!
    * \brief Collect indices from register_file for output tensors.
    * It helps to replace output tensors allocated in RunLoop by
@@ -431,6 +437,8 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
   /*! \brief The function name to flag enabling scenario with set outputs. */
   std::unordered_map<std::string, bool> set_outputs_enabled_;
+  /*! \brief The index of the instruction whose destination is the result register. */
+  Index preresult_op_index_ = -1;
   /*! \brief The function name to indices of output tensors in register file. */
   std::unordered_map<std::string, std::vector<Index>> output_tensor_reg_indices_;
   /*! \brief The function name to pre-allocated outputs mapping. */
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index c810d00942a6..541e6d2ddfae 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -593,6 +593,15 @@ Index VirtualMachine::GetResultRegisterIndex() const {
   return code_[op_index].result;
 }
 
+void VirtualMachine::CalculatePreResultOpIndex(Index res_index) {
+  if (preresult_op_index_ == -1) {
+    preresult_op_index_ = 0;
+    while (code_[preresult_op_index_].dst != res_index) {
+      ++preresult_op_index_;
+    }
+  }
+}
+
 void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name) {
   if (!output_tensor_reg_indices_[func_name].empty()) {
     return;
@@ -600,18 +609,19 @@ void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name)
 
   auto& reg_indices = output_tensor_reg_indices_[func_name];
   Index res_index = GetResultRegisterIndex();
-  Index op_index = 0;
-  while (code_[op_index].dst != res_index) {
-    ++op_index;
-  }
-  if (code_[op_index].op == Opcode::AllocTensor) {
+  CalculatePreResultOpIndex(res_index);
+  auto& preres_instr = code_[preresult_op_index_];
+  auto op_code = preres_instr.op;
+  if (op_code == Opcode::AllocTensor) {
     reg_indices.emplace_back(res_index);
-  } else if (code_[op_index].op == Opcode::AllocADT) {
-    for (Index i = 0; i < code_[op_index].num_fields; ++i) {
-      reg_indices.push_back(code_[op_index].datatype_fields[i]);
+  } else if (op_code == Opcode::AllocADT) {
+    for (Index i = 0; i < preres_instr.num_fields; ++i) {
+      reg_indices.push_back(preres_instr.datatype_fields[i]);
     }
+  } else if (op_code == Opcode::ReshapeTensor) {
+    reg_indices.push_back(preres_instr.reshape_tensor.tensor);
   } else {
-    // TODO(vvchernov): possible extension
+    LOG(WARNING) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
   }
 }
 
@@ -763,10 +773,10 @@ void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices
       }
       case Opcode::AllocTensor: {
         OpStartHook(instr);
-        if (output_tensor_reg_indices.empty()) {
-          WriteAllocatedTensor(instr);
-        } else {
+        if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) {
           WriteAllocatedTensorFromOutside(instr, output_tensor_reg_indices);
+        } else {
+          WriteAllocatedTensor(instr);
         }
         OpStopHook();
         pc_++;
@@ -932,11 +942,43 @@ void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
 
 void VirtualMachine::WriteAllocatedTensorFromOutside(
     const Instruction& instr, const std::vector<Index>& output_tensor_reg_indices) {
-  if (FindIndex(output_tensor_reg_indices, instr.dst)) {
-    // TODO(vvchernov): check shape
-  } else {
-    LOG(WARNING) << "Writting of allocated tensor from outside fails. Usual approach is used";
-    WriteAllocatedTensor(instr);
+  for (auto res_index : output_tensor_reg_indices) {
+    auto arr = Downcast<NDArray>(ReadRegister(res_index));
+    auto shape = arr.Shape();
+    size_t size = shape.size();
+    bool size_check = false;
+    if (size != instr.alloc_tensor.ndim) {
+      size_check = true;
+    } else {
+      for (size_t i = 0; i < size; ++i) {
+        if (shape[i] != instr.alloc_tensor.shape[i]) {
+          size_check = true;
+          break;
+        }
+      }
+    }
+
+    if (size_check) {
+      // Match element number
+      size_t in_el_num = 1, ex_el_num = 1;
+      for (size_t i = 0; i < size; ++i) {
+        in_el_num *= shape[i];
+      }
+      for (size_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
+        ex_el_num *= instr.alloc_tensor.shape[i];
+      }
+      ICHECK_EQ(in_el_num, ex_el_num)
+          << "Element number mismatching of internal and external output tensors";
+      if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
+        int64_t* dims = instr.alloc_tensor.shape;
+        int64_t ndim = instr.alloc_tensor.ndim;
+        std::vector<int64_t> ref_shape(dims, dims + ndim);
+        auto reshaped_tensor = arr.CreateView(ref_shape, arr->dtype);
+        WriteRegister(res_index, reshaped_tensor);
+      } else {
+        LOG_ERROR << "Internal and external output tensor shapes are mismatched";
+      }
+    }
   }
 }
 

From dec50224b863d89c6d0aa11974ba7e71dea6bf63 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 29 Aug 2022 17:21:58 +0300
Subject: [PATCH 11/21] small fix. code cleaning

---
 src/runtime/vm/vm.cc | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 541e6d2ddfae..504aada7d83a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -943,15 +943,23 @@ void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
 void VirtualMachine::WriteAllocatedTensorFromOutside(
     const Instruction& instr, const std::vector<Index>& output_tensor_reg_indices) {
   for (auto res_index : output_tensor_reg_indices) {
-    auto arr = Downcast<NDArray>(ReadRegister(res_index));
-    auto shape = arr.Shape();
-    size_t size = shape.size();
+    // External tensor(s) has been already written to the register
+    auto ex_arr = Downcast<NDArray>(ReadRegister(res_index));
+    auto ex_shape = ex_arr.Shape();
+    auto ex_size = ex_shape.size();
+    auto ex_dtype = ex_arr->dtype;
+
+    auto in_size = instr.alloc_tensor.ndim;
+    auto in_dtype = instr.alloc_tensor.dtype;
+    ICHECK_EQ(TypeEqual(in_dtype, ex_dtype), true)
+          << "Data types mismatching for internal and external output tensors";
+
     bool size_check = false;
-    if (size != instr.alloc_tensor.ndim) {
+    if (ex_size != in_size) {
       size_check = true;
     } else {
-      for (size_t i = 0; i < size; ++i) {
-        if (shape[i] != instr.alloc_tensor.shape[i]) {
+      for (size_t i = 0; i < in_size; ++i) {
+        if (ex_shape[i] != instr.alloc_tensor.shape[i]) {
           size_check = true;
           break;
         }
@@ -961,19 +969,18 @@ void VirtualMachine::WriteAllocatedTensorFromOutside(
     if (size_check) {
       // Match element number
       size_t in_el_num = 1, ex_el_num = 1;
-      for (size_t i = 0; i < size; ++i) {
-        in_el_num *= shape[i];
+      for (size_t i = 0; i < ex_size; ++i) {
+        ex_el_num *= ex_shape[i];
       }
-      for (size_t i = 0; i < instr.alloc_tensor.ndim; ++i) {
-        ex_el_num *= instr.alloc_tensor.shape[i];
+      for (size_t i = 0; i < in_size; ++i) {
+        in_el_num *= instr.alloc_tensor.shape[i];
       }
       ICHECK_EQ(in_el_num, ex_el_num)
           << "Element number mismatching of internal and external output tensors";
       if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
         int64_t* dims = instr.alloc_tensor.shape;
-        int64_t ndim = instr.alloc_tensor.ndim;
-        std::vector<int64_t> ref_shape(dims, dims + ndim);
-        auto reshaped_tensor = arr.CreateView(ref_shape, arr->dtype);
+        std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size));
+        auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
         WriteRegister(res_index, reshaped_tensor);
       } else {
         LOG_ERROR << "Internal and external output tensor shapes are mismatched";

From 6318515885d86455a705d82dbc743e91173e0a94 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 29 Aug 2022 18:43:34 +0300
Subject: [PATCH 12/21] fix excess passing during shape and data type check for
 multiple outputs networks

---
 include/tvm/runtime/vm/vm.h |  6 +--
 src/runtime/vm/vm.cc        | 79 ++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index b07dce900987..8d585f6704a6 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -408,13 +408,11 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   /*!
    * \brief 'set_outputs_enabled' is assumed true for using this method.
    * It is expected that result register has already contained tensor from outside,
-   * new tensor is not allocated and write, but expected shape is checked.
+   * no new memory is allocated or written, but the expected shape and data type are checked.
    * For other register WriteAllocatedTensor method is used.
    * \param instr current instruction containing shape and storage info.
-   * \param output_tensor_reg_indices register indices of output tensors.
    */
-  void WriteAllocatedTensorFromOutside(const Instruction& instr,
-                                       const std::vector<Index>& output_tensor_reg_indices);
+  void WriteAllocatedTensorFromOutside(const Instruction& instr);
 
   bool FindIndex(const std::vector<Index>& indices, Index val) const;
 
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 504aada7d83a..80a566415634 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -774,7 +774,7 @@ void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices
       case Opcode::AllocTensor: {
         OpStartHook(instr);
         if (!output_tensor_reg_indices.empty() && FindIndex(output_tensor_reg_indices, instr.dst)) {
-          WriteAllocatedTensorFromOutside(instr, output_tensor_reg_indices);
+          WriteAllocatedTensorFromOutside(instr);
         } else {
           WriteAllocatedTensor(instr);
         }
@@ -940,51 +940,48 @@ void VirtualMachine::WriteAllocatedTensor(const Instruction& instr) {
   WriteRegister(instr.dst, obj);
 }
 
-void VirtualMachine::WriteAllocatedTensorFromOutside(
-    const Instruction& instr, const std::vector<Index>& output_tensor_reg_indices) {
-  for (auto res_index : output_tensor_reg_indices) {
-    // External tensor(s) has been already written to the register
-    auto ex_arr = Downcast<NDArray>(ReadRegister(res_index));
-    auto ex_shape = ex_arr.Shape();
-    auto ex_size = ex_shape.size();
-    auto ex_dtype = ex_arr->dtype;
+void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
+  // The external tensor has already been written to the register (instr.dst)
+  auto ex_arr = Downcast<NDArray>(ReadRegister(instr.dst));
+  auto ex_shape = ex_arr.Shape();
+  auto ex_size = ex_shape.size();
+  auto ex_dtype = ex_arr->dtype;
 
-    auto in_size = instr.alloc_tensor.ndim;
-    auto in_dtype = instr.alloc_tensor.dtype;
-    ICHECK_EQ(TypeEqual(in_dtype, ex_dtype), true)
-          << "Data types mismatching for internal and external output tensors";
+  auto in_size = instr.alloc_tensor.ndim;
+  auto in_dtype = instr.alloc_tensor.dtype;
+  ICHECK_EQ(TypeEqual(in_dtype, ex_dtype), true)
+      << "Data types mismatching for internal and external output tensors";
 
-    bool size_check = false;
-    if (ex_size != in_size) {
-      size_check = true;
-    } else {
-      for (size_t i = 0; i < in_size; ++i) {
-        if (ex_shape[i] != instr.alloc_tensor.shape[i]) {
-          size_check = true;
-          break;
-        }
+  bool size_check = false;
+  if (ex_size != in_size) {
+    size_check = true;
+  } else {
+    for (size_t i = 0; i < in_size; ++i) {
+      if (ex_shape[i] != instr.alloc_tensor.shape[i]) {
+        size_check = true;
+        break;
       }
     }
+  }
 
-    if (size_check) {
-      // Match element number
-      size_t in_el_num = 1, ex_el_num = 1;
-      for (size_t i = 0; i < ex_size; ++i) {
-        ex_el_num *= ex_shape[i];
-      }
-      for (size_t i = 0; i < in_size; ++i) {
-        in_el_num *= instr.alloc_tensor.shape[i];
-      }
-      ICHECK_EQ(in_el_num, ex_el_num)
-          << "Element number mismatching of internal and external output tensors";
-      if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
-        int64_t* dims = instr.alloc_tensor.shape;
-        std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size));
-        auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
-        WriteRegister(res_index, reshaped_tensor);
-      } else {
-        LOG_ERROR << "Internal and external output tensor shapes are mismatched";
-      }
+  if (size_check) {
+    // Match element number
+    size_t in_el_num = 1, ex_el_num = 1;
+    for (size_t i = 0; i < ex_size; ++i) {
+      ex_el_num *= ex_shape[i];
+    }
+    for (size_t i = 0; i < in_size; ++i) {
+      in_el_num *= instr.alloc_tensor.shape[i];
+    }
+    ICHECK_EQ(in_el_num, ex_el_num)
+        << "Element number mismatching of internal and external output tensors";
+    if (code_[preresult_op_index_].op == Opcode::ReshapeTensor) {
+      int64_t* dims = instr.alloc_tensor.shape;
+      std::vector<int64_t> ref_shape(dims, dims + int64_t(in_size));
+      auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
+      WriteRegister(instr.dst, reshaped_tensor);
+    } else {
+      LOG_ERROR << "Internal and external output tensor shapes are mismatched";
     }
   }
 }

From ef3e0963fcf1f3aa8494bc74f159e73585493a37 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 9 Sep 2022 12:04:27 +0300
Subject: [PATCH 13/21] update fatal error logs

---
 src/runtime/vm/vm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 80a566415634..1fe9e74d4cd1 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -621,7 +621,7 @@ void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name)
   } else if (op_code == Opcode::ReshapeTensor) {
     reg_indices.push_back(preres_instr.reshape_tensor.tensor);
   } else {
-    LOG(WARNING) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
+    LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
   }
 }
 
@@ -981,7 +981,7 @@ void VirtualMachine::WriteAllocatedTensorFromOutside(const Instruction& instr) {
       auto reshaped_tensor = ex_arr.CreateView(ref_shape, ex_dtype);
       WriteRegister(instr.dst, reshaped_tensor);
     } else {
-      LOG_ERROR << "Internal and external output tensor shapes are mismatched";
+      LOG(FATAL) << "Internal and external output tensor shapes are mismatched";
     }
   }
 }

From d8df1d54663b7b2b5476fdd361461402b59efa04 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 9 Sep 2022 12:32:00 +0300
Subject: [PATCH 14/21] clean CollectOutputTensorRegIndices method

---
 include/tvm/runtime/vm/vm.h |  6 +++---
 src/runtime/vm/vm.cc        | 13 ++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 8d585f6704a6..f50794f0657d 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -392,12 +392,12 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   void CalculatePreResultOpIndex(Index res_index);
 
   /*!
-   * \brief Collect indices from register_file for output tensors.
+   * \brief Get indices from register_file for output tensors.
    * It helps to replace output tensors allocated in RunLoop by
    * tensors pre-allocated outside. Scenario is when `set_output` is used
-   * \param func_name The function's name.
+   * \return indices from register_file for output tensors.
    */
-  void CollectOutputTensorRegIndices(const std::string& func_name);
+  std::vector<Index> GetOutputTensorRegIndices();
 
   /*!
    * \brief Write new allocated tensor to register_file of frame.
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 1fe9e74d4cd1..57c77c730fe3 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -314,7 +314,9 @@ void VirtualMachine::SetOutputTensorsToRegister(const std::string& func_name,
                                                 const std::vector<ObjectRef>& outputs) {
   size_t size = outputs.size();
 
-  CollectOutputTensorRegIndices(func_name);
+  if (output_tensor_reg_indices_[func_name].empty()) {
+    output_tensor_reg_indices_[func_name] = GetOutputTensorRegIndices();
+  }
   auto& reg_indices = output_tensor_reg_indices_[func_name];
   ICHECK_EQ(reg_indices.size(), size)
       << "Number of outside output tensors should be equal to model outputs number";
@@ -602,12 +604,8 @@ void VirtualMachine::CalculatePreResultOpIndex(Index res_index) {
   }
 }
 
-void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name) {
-  if (!output_tensor_reg_indices_[func_name].empty()) {
-    return;
-  }
-
-  auto& reg_indices = output_tensor_reg_indices_[func_name];
+std::vector<Index> VirtualMachine::GetOutputTensorRegIndices() {
+  std::vector<Index> reg_indices;
   Index res_index = GetResultRegisterIndex();
   CalculatePreResultOpIndex(res_index);
   auto& preres_instr = code_[preresult_op_index_];
@@ -623,6 +621,7 @@ void VirtualMachine::CollectOutputTensorRegIndices(const std::string& func_name)
   } else {
     LOG(FATAL) << "Operation " << size_t(op_code) << " is not supported for set_outputs method";
   }
+  return reg_indices;
 }
 
 void VirtualMachine::RunLoop(const std::vector<Index>& output_tensor_reg_indices) {

From 94635528074880059dc26f0dc7776477ad3404f5 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 9 Sep 2022 17:33:41 +0300
Subject: [PATCH 15/21] extend description

---
 include/tvm/runtime/vm/vm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index f50794f0657d..ecb7223ce77e 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -293,6 +293,8 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
 
   /*!
    * \brief Set pre-allocated outputs to a function.
+   * It is the native implementation of the 'set_outputs' Python method.
+   * It is used in the scenario where output tensors are allocated externally.
    * \param name The function name
    * \param args outputs to the function.
    */

From 1493e59c5578504ebae8ebac9b3b0809c4a39267 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 12 Sep 2022 21:00:24 +0300
Subject: [PATCH 16/21] clear outputs_ after invoke

---
 src/runtime/vm/vm.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 57c77c730fe3..976a1cde0b89 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -148,6 +148,7 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
           ICHECK(outputs_.count(func_name))
               << "Outputs have not been set for function " << func_name;
           *rv = Invoke(func, input_args, outputs_[func_name]);
+          outputs_[func_name].clear();
           set_outputs_enabled_[func_name] = false;
         } else {
           *rv = Invoke(func, input_args);

From 2dc148a578f9ebba69e7e8da9c09a2143fa2b556 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 12 Sep 2022 21:21:27 +0300
Subject: [PATCH 17/21] update invoke_with_outputs by input args

---
 python/tvm/runtime/vm.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 3415f277f786..25dbfde37040 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -561,9 +561,9 @@ def invoke_stateful(self, func_name, *args, **kwargs):
             self.set_input(func_name, *args, **kwargs)
         self._invoke_stateful(func_name)
 
-    def invoke_with_outputs(self, func_name, *args):
+    def invoke_with_outputs(self, func_name, input_args, output_args):
         """Invoke a function with pre-allocated outputs tensors.
-        It requires use set_input method before.
+        input_args can be None if set_input method was used before.
 
         This invoke method allows to avoid excess copying if memory for output tensors
         was allocated before inference.
@@ -573,10 +573,31 @@ def invoke_with_outputs(self, func_name, *args):
         func_name : str
             The name of the function.
 
-        args : list[tvm.runtime.NDArray] or list[DLTensor]
+        input_args: dict of str to tvm.runtime.NDArray or np.ndarray
+            Named arguments to the function.
+
+        output_args : list[tvm.runtime.NDArray] or list[DLTensor]
             The output tensors of the function.
         """
-        self._set_outputs(func_name, *args)
+        if input_args:
+            func_params = self._exec.get_function_params(func_name)
+            new_args = [None] * len(func_params)
+            cnt = 0
+            for k in input_args:
+                if k in func_params:
+                    idx = func_params.index(k)
+                    new_args[idx] = input_args[k]
+                    cnt += 1
+            assert len(args) + cnt == len(func_params)
+            idx = 0
+            for i, arg in enumerate(new_args):
+                if arg is None:
+                    new_args[i] = args[idx]
+                    idx += 1
+            args = new_args
+        cargs = convert(args)
+        self._set_input(func_name, *cargs)
+        self._set_outputs(func_name, *output_args)
         self._invoke(func_name)
 
     def get_outputs(self):

From 4c117d3ca4a2c094023ad274ffaf955fbec40499 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 29 Sep 2022 10:40:53 +0300
Subject: [PATCH 18/21] small fix in invoke_with_outputs method of VM. rpc test
 for this method was implemented

---
 python/tvm/runtime/vm.py      | 10 ++----
 tests/python/relay/test_vm.py | 66 +++++++++++++++++++++++++++++++----
 2 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 25dbfde37040..65f971f261e8 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -588,14 +588,8 @@ def invoke_with_outputs(self, func_name, input_args, output_args):
                     idx = func_params.index(k)
                     new_args[idx] = input_args[k]
                     cnt += 1
-            assert len(args) + cnt == len(func_params)
-            idx = 0
-            for i, arg in enumerate(new_args):
-                if arg is None:
-                    new_args[i] = args[idx]
-                    idx += 1
-            args = new_args
-        cargs = convert(args)
+            assert cnt == len(func_params)
+        cargs = convert(new_args)
         self._set_input(func_name, *cargs)
         self._set_outputs(func_name, *output_args)
         self._invoke(func_name)
diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index 0b62db85c904..dbc10d02124a 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -846,26 +846,38 @@ def relay_ext_test(func):
     assert "shape_func" in opt_mod.astext(False)
 
 
-def test_vm_rpc():
+def prepare_vm_model(path, tensor_shape):
     """
-    This test checks to make sure you can export a VMExecutable,
-    upload it to a remote machine using RPC and then execute it
-    on the other machine.
+    Compile a Virtual Machine executable for a simple topology and
+    export it as a library to the given path.
     """
     target = tvm.target.Target("llvm --host=llvm")
 
     # Build a IRModule.
-    x = relay.var("x", shape=(10, 1))
+    x = relay.var("x", shape=tensor_shape)
     f = relay.Function([x], x + x)
     mod = IRModule.from_expr(f)
 
     # Compile to VMExecutable.
     vm_exec = vm.compile(mod, target=target)
 
+    # Export to Disk
+    vm_exec.mod.export_library(path)
+
+
+def test_vm_rpc():
+    """
+    This test checks to make sure you can export a VMExecutable,
+    upload it to a remote machine using RPC and then execute it
+    on the other machine.
+    """
+    # Shape for input and output tensors
+    shape = (10,1)
+
     # Export to Disk
     temp = utils.tempdir()
     path = temp.relpath("vm_library.so")
-    vm_exec.mod.export_library(path)
+    prepare_vm_model(path, shape)
 
     # Use local rpc server for testing.
     # Server must use popen so it doesn't inherit the current process state. It
@@ -881,7 +893,7 @@ def check_remote(server):
         device = remote.cpu()
         # Build a VM out of the executable and context.
         vm_factory = runtime.vm.VirtualMachine(rexec, device)
-        np_input = np.random.uniform(size=(10, 1)).astype("float32")
+        np_input = np.random.uniform(size=shape).astype("float32")
         input_tensor = tvm.nd.array(np_input, device)
         # Invoke its "main" function.
         out = vm_factory.invoke("main", input_tensor)
@@ -891,6 +903,46 @@ def check_remote(server):
     check_remote(rpc.Server("127.0.0.1"))
 
 
+def test_vm_invoke_with_outputs_rpc():
+    """
+    This test checks to make sure you can export a VMExecutable,
+    upload it to a remote machine using RPC and then execute it
+    on the other machine with preallocated outputs.
+    """
+    # Shape for input and output tensors
+    shape = (3,2)
+
+    # Export to Disk
+    temp = utils.tempdir()
+    path = temp.relpath("vm_library.so")
+    prepare_vm_model(path, shape)
+
+    # Use local rpc server for testing.
+    # Server must use popen so it doesn't inherit the current process state. It
+    # will crash otherwise.
+    def check_remote_invoke_with_outputs(server):
+        remote = rpc.connect(server.host, server.port, session_timeout=10)
+
+        # Upload the serialized Executable.
+        remote.upload(path)
+        # Get a handle to remote Executable.
+        rexec = remote.load_module("vm_library.so")
+
+        device = remote.cpu()
+        # Build a VM out of the executable and context.
+        vm_factory = runtime.vm.VirtualMachine(rexec, device)
+        np_input = np.random.uniform(size=shape).astype("float32")
+        input_tensor = tvm.nd.array(np_input, device)
+        np_output = np.empty(shape, dtype="float32")
+        output_tensor = tvm.nd.array(np_output, device)
+        # Invoke its "main" function.
+        vm_factory.invoke_with_outputs("main", input_args={"x": input_tensor}, output_args=[output_tensor])
+        # Check the result.
+        np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
+
+    check_remote_invoke_with_outputs(rpc.Server("127.0.0.1"))
+
+
 def test_get_output_single():
     target = tvm.target.Target("llvm")
 

From 2a9d1b378e97169b0d2f33db7cebf9a3b8186897 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 29 Sep 2022 11:07:00 +0300
Subject: [PATCH 19/21] local test of invoke_with_outputs of VM was implemented

---
 tests/python/relay/test_vm.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index dbc10d02124a..ce700c971ac4 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -943,6 +943,28 @@ def check_remote_invoke_with_outputs(server):
     check_remote_invoke_with_outputs(rpc.Server("127.0.0.1"))
 
 
+def test_vm_invoke_with_outputs():
+    target = tvm.target.Target("llvm")
+    shape=(3, 2)
+
+    # Build a IRModule.
+    x = relay.var("x", shape=shape)
+    f = relay.Function([x], x + x)
+    mod = IRModule.from_expr(f)
+
+    # Compile to VMExecutable.
+    vm_exec = vm.compile(mod, target=target)
+    vm_factory = runtime.vm.VirtualMachine(vm_exec, tvm.cpu())
+    np_input = np.random.uniform(size=shape).astype("float32")
+    input_tensor = tvm.nd.array(np_input)
+    np_output = np.empty(shape, dtype="float32")
+    output_tensor = tvm.nd.array(np_output)
+    # Invoke
+    vm_factory.invoke_with_outputs("main", input_args={"x": input_tensor}, output_args=[output_tensor])
+    # Check the result.
+    np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
+
+
 def test_get_output_single():
     target = tvm.target.Target("llvm")
 

From 6a34710719934f5f6fc891d9a7b1edcef5214185 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 29 Sep 2022 11:25:40 +0300
Subject: [PATCH 20/21] update description for set_outputs scenario

---
 include/tvm/runtime/vm/vm.h | 6 ++++--
 python/tvm/runtime/vm.py    | 4 +++-
 src/runtime/vm/vm.cc        | 1 -
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index ecb7223ce77e..6fa91832a731 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -292,9 +292,11 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   void SetOneInput(std::string name, const TVMArgValue& tag, const TVMArgValue& tensor);
 
   /*!
-   * \brief Set pre-allocated outputs to a function.
+   * \brief Set pre-allocated output tensors to a function.
    * It is native implementation of 'set_outputs' python method.
-   * It is used in scenario when output tensors are allocated outside.
+   * It is used in scenario when output tensors are allocated outside each invocation.
+   * Note: it sets set_outputs_enabled_[name] to true and fills outputs_[name],
+   * but after invocation the former is switched off and the latter is cleared.
    * \param name The function name
    * \param args outputs to the function.
    */
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 65f971f261e8..20778c40fd51 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -562,7 +562,9 @@ def invoke_stateful(self, func_name, *args, **kwargs):
         self._invoke_stateful(func_name)
 
     def invoke_with_outputs(self, func_name, input_args, output_args):
-        """Invoke a function with pre-allocated outputs tensors.
+        # TODO(vvchernov): consider the scenario when output tensors are set once
+        """Invoke a function with pre-allocated output tensors.
+        The output tensors should be set on every invocation.
         input_args can be None if set_input method was used before.
 
         This invoke method allows to avoid excess copying if memory for output tensors
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 976a1cde0b89..aaf4675733a8 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -292,7 +292,6 @@ void VirtualMachine::SetOutputs(std::string func_name, TVMArgs args) {
   std::vector<ObjectRef> func_args(outputs_size - 1);
   for (size_t i = 1; i < outputs_size; ++i) {
     // TODO(vvchernov): device?
-    // TODO(vvchernov): correct index sequence for multiple outputs?
     func_args[i - 1] = TensorFromTVMArgValueToObjectRef(args[i]);
   }
   outputs_.erase(func_name);

From c335fdb414203bf7e2c4b13e2674ecadf1a37318 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 29 Sep 2022 12:19:52 +0300
Subject: [PATCH 21/21] lint fixes

---
 tests/python/relay/test_vm.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py
index ce700c971ac4..45e305c9a195 100644
--- a/tests/python/relay/test_vm.py
+++ b/tests/python/relay/test_vm.py
@@ -872,7 +872,7 @@ def test_vm_rpc():
     on the other machine.
     """
     # Shape for input and output tensors
-    shape = (10,1)
+    shape = (10, 1)
 
     # Export to Disk
     temp = utils.tempdir()
@@ -910,7 +910,7 @@ def test_vm_invoke_with_outputs_rpc():
     on the other machine with preallocated outputs.
     """
     # Shape for input and output tensors
-    shape = (3,2)
+    shape = (3, 2)
 
     # Export to Disk
     temp = utils.tempdir()
@@ -936,7 +936,9 @@ def check_remote_invoke_with_outputs(server):
         np_output = np.empty(shape, dtype="float32")
         output_tensor = tvm.nd.array(np_output, device)
         # Invoke its "main" function.
-        vm_factory.invoke_with_outputs("main", input_args={"x": input_tensor}, output_args=[output_tensor])
+        vm_factory.invoke_with_outputs(
+            "main", input_args={"x": input_tensor}, output_args=[output_tensor]
+        )
         # Check the result.
         np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)
 
@@ -945,7 +947,7 @@ def check_remote_invoke_with_outputs(server):
 
 def test_vm_invoke_with_outputs():
     target = tvm.target.Target("llvm")
-    shape=(3, 2)
+    shape = (3, 2)
 
     # Build a IRModule.
     x = relay.var("x", shape=shape)
@@ -960,7 +962,9 @@ def test_vm_invoke_with_outputs():
     np_output = np.empty(shape, dtype="float32")
     output_tensor = tvm.nd.array(np_output)
     # Invoke
-    vm_factory.invoke_with_outputs("main", input_args={"x": input_tensor}, output_args=[output_tensor])
+    vm_factory.invoke_with_outputs(
+        "main", input_args={"x": input_tensor}, output_args=[output_tensor]
+    )
     # Check the result.
     np.testing.assert_allclose(output_tensor.numpy(), np_input + np_input)