apache · tkonolige · Sep 29, 2022 · Apr 22, 2022 · Apr 22, 2022 · May 12, 2022
diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
@@ -226,6 +226,16 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   ObjectRef Invoke(const std::string& name, const std::vector<ObjectRef>& args);
 
+  /*!
+   * \brief Invoke a VM function.
+   * \param func The function.
+   * \param input_args The input arguments to the function.
+   * \param output_args The pre-allocated output arguments of the function.
+   * \return The object(s) representing the result.
+   */
+  ObjectRef Invoke(const VMFunction& func, const std::vector<ObjectRef>& input_args,
+                   const std::vector<ObjectRef>& output_args);
+
   /*!
    * \brief Invoke a PackedFunction
    *
@@ -249,7 +259,7 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
             const std::vector<AllocatorType>& alloc_types);
 
   /*! \brief Run VM dispatch loop. */
-  void RunLoop();
+  void RunLoop(const std::vector<Index>& output_tensor_reg_indices = {});
 
   /*! \brief Get device from the device list based on a given device index. */
   Device GetDevice(Index device_index) const;
@@ -281,6 +291,32 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
    */
   void SetOneInput(std::string name, const TVMArgValue& tag, const TVMArgValue& tensor);
 
+  /*!
+   * \brief Set pre-allocated output tensors to a function.
+   * It is native implementation of 'set_outputs' python method.
+   * It is used in scenario when output tensors are allocated outside each invocation.
+   * Note: it sets set_outputs_enabled_[name] true and fill outputs_[name]
+   * but after invocation the first is switched off and the second is cleared
+   * \param name The function name
+   * \param args outputs to the function.
+   */
+  void SetOutputs(std::string name, TVMArgs args);
+
+  /*!
+   * \brief Preparation part of Invoke method before RunLoop.
+   * \param func the function.
+   * \param args input args
+   */
+  void PrintInfoAndSetInputArgs(const VMFunction& func, const std::vector<ObjectRef>& args);
+
+  /*!
+   * \brief Set pre-allocated outputs to register for specified function.
+   * \param func_name The function's name.
+   * \param outputs set of output tensors.
+   */
+  void SetOutputTensorsToRegister(const std::string& func_name,
+                                  const std::vector<ObjectRef>& outputs);
+
   /*!
    * \brief Internal hook for profiling the start of an op.
    *
@@ -339,6 +375,51 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   void SetInputTensorWithIndex(std::vector<ObjectRef>& tensors,  // NOLINT(*)
                                const TVMArgValue& tensor, int index, Device dev);
 
+  /*!
+   * \brief Convert tensor from TVMArgValue to ObjectRef.
+   * DLTensor and NDArray types are supported.
+   * \param tensor given arg value containing tensor.
+   * \return tensor in ObjectRef format
+   */
+  ObjectRef TensorFromTVMArgValueToObjectRef(const TVMArgValue& tensor) const;
+
+  /*!
+   * \brief Get index of outputs in register_file from func code
+   * \return result register index
+   */
+  Index GetResultRegisterIndex() const;
+
+  /*!
+   * \brief Calculate the index of operation which destination is result
+   * \param res_index is the index of op returning result
+   */
+  void CalculatePreResultOpIndex(Index res_index);
+
+  /*!
+   * \brief Get indices from register_file for output tensors.
+   * It helps to replace output tensors allocated in RunLoop by
+   * tensors pre-allocated outside. Scenario is when `set_output` is used
+   * \return indices from register_file for output tensors.
+   */
+  std::vector<Index> GetOutputTensorRegIndices();
+
+  /*!
+   * \brief Write new allocated tensor to register_file of frame.
+   * \param instr current instruction containing shape and storage info.
+   */
+  void WriteAllocatedTensor(const Instruction& instr);
+
+  /*!
+   * \brief 'set_outputs_enabled' is assumed true for using this method.
+   * It is expected that result register has already contained tensor from outside,
+   * new memory is not allocated and write, but expected shape and data type are checked.
+   * For other register WriteAllocatedTensor method is used.
+   * \param instr current instruction containing shape and storage info.
+   */
+  void WriteAllocatedTensorFromOutside(const Instruction& instr);
+
+  bool FindIndex(const std::vector<Index>& indices, Index val) const;
+
  protected:
   /*! \brief The virtual machine's packed function table. */
   std::vector<PackedFunc> packed_funcs_;
@@ -356,6 +437,14 @@ class TVM_DLL VirtualMachine : public runtime::ModuleNode {
   ObjectPtr<Executable> exec_;
   /*! \brief The function name to inputs mapping. */
   std::unordered_map<std::string, std::vector<ObjectRef>> inputs_;
+  /*! \brief The function name to flag enabling scenario with set outputs. */
+  std::unordered_map<std::string, bool> set_outputs_enabled_;
+  /*! \brief The index of operation which destination is result. */
+  Index preresult_op_index_ = -1;
+  /*! \brief The function name to indices of output tensors in register file. */
+  std::unordered_map<std::string, std::vector<Index>> output_tensor_reg_indices_;
+  /*! \brief The function name to pre-allocated outputs mapping. */
+  std::unordered_map<std::string, std::vector<ObjectRef>> outputs_;
   /*!
    * \brief The "physical" devices the VM can execute primitives on. All "device indexes"
    * are w.r.t. this vector. Each entry in this vector must match the corresponding entry

diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
@@ -399,6 +399,7 @@ def __init__(self, exe, device, memory_cfg=None):
         self._get_input_index = self.module["get_input_index"]
         self._set_input = self.module["set_input"]
         self._set_one_input = self.module["set_one_input"]
+        self._set_outputs = self.module["set_outputs"]
         self._setup_device(device, memory_cfg)
 
     def _setup_device(self, dev, memory_cfg):
@@ -560,6 +561,41 @@ def invoke_stateful(self, func_name, *args, **kwargs):
             self.set_input(func_name, *args, **kwargs)
         self._invoke_stateful(func_name)
 
+    def invoke_with_outputs(self, func_name, input_args, output_args):
+        # TODO(vvchernov): consider scenario then output tensors set once
+        """Invoke a function with pre-allocated output tensors.
+        The output tensors should be set every invocation.
+        input_args can be None if set_input method was used before.
+
+        This invoke method allows to avoid excess copying if memory for output tensors
+        was allocated before inference.
+
+        Parameters
+        ----------
+        func_name : str
+            The name of the function.
+
+        input_args: dict of str to tvm.runtime.NDArray or np.ndarray
+            Named arguments to the function.
+
+        output_args : list[tvm.runtime.NDArray] or list[DLTensor]
+            The output tensors of the function.
+        """
+        if input_args:
+            func_params = self._exec.get_function_params(func_name)
+            new_args = [None] * len(func_params)
+            cnt = 0
+            for k in input_args:
+                if k in func_params:
+                    idx = func_params.index(k)
+                    new_args[idx] = input_args[k]
+                    cnt += 1
+            assert cnt == len(func_params)
+        cargs = convert(new_args)
+        self._set_input(func_name, *cargs)
+        self._set_outputs(func_name, *output_args)
+        self._invoke(func_name)
+
     def get_outputs(self):
         """Get the outputs from a call to :py:func`invoke_stateful`.