diff --git a/python/gen_requirements.py b/python/gen_requirements.py
index 1a55dccd1130..55f2cad8411d 100644
--- a/python/gen_requirements.py
+++ b/python/gen_requirements.py
@@ -71,6 +71,7 @@
                 "psutil",
                 "scipy",
                 "tornado",
+                "typing_extensions",
             ],
         ),
     ),
@@ -276,6 +277,7 @@
     ("torch", None),
     ("torchvision", None),
     ("tornado", None),
+    ("typing_extensions", None),
     ("xgboost", ">=1.1.0"),  # From PR #4953 & Issue #12009
 ]
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index 3c3d931df5d9..d72e198fec0d 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -475,8 +475,13 @@ void GraphExecutor::SetupStorage() {
   // is mapped to this pool.
   data_entry_.resize(num_node_entries());
   data_alignment_.resize(num_node_entries());
+  // sid_to_eid_ has one slot per storage_id, i.e. the size of storage_pool_.
+  sid_to_eid_.resize(storage_pool_.size());
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    // Record the "storage_id -> entry_id" mapping.
+    sid_to_eid_[storage_id].push_back(i);
+
     ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
     data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
@@ -504,14 +509,14 @@ void GraphExecutor::SetupOpExecs() {
   for (uint32_t nid = 0; nid < this->GetNumOfNodes(); ++nid) {
     const auto& inode = nodes_[nid];
     if (inode.op_type == "null") continue;
-    std::vector<DLTensor> args;
+    std::vector<DLTensor*> args;
     for (const auto& e : inode.inputs) {
       uint32_t eid = this->entry_id(e);
-      args.push_back(*(data_entry_[eid].operator->()));
+      args.push_back(const_cast<DLTensor*>(data_entry_[eid].operator->()));
     }
     for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
       uint32_t eid = this->entry_id(nid, index);
-      args.push_back(*(data_entry_[eid].operator->()));
+      args.push_back(const_cast<DLTensor*>(data_entry_[eid].operator->()));
     }
     ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op";
@@ -524,6 +529,16 @@
       if (input_node_eids.count(input_eid) > 0) {
         input_dltensors_[input_eid].push_back(
             static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
+
+        // Data entries that share this storage_id should also be pushed into "input_dltensors_"
+        // so that "SetInputZeroCopy()" can update them as well. This handles the situation where
+        // a "relay.reshape" immediately follows an input, and the input dltensor and the
+        // reshape's output dltensor point to the same data_entry.
+        auto storage_id = attrs_.storage_id[input_eid];
+        for (auto eid : sid_to_eid_[storage_id]) {
+          input_dltensors_[input_eid].push_back(
+              const_cast<DLTensor*>(data_entry_[eid].operator->()));
+        }
       }
       // check if any model output is the input of the op
       if (output_node_eids.count(input_eid) > 0) {
@@ -544,7 +559,7 @@ std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs> > GraphExecutor::CreateTVMOp(
-    const TVMOpParam& param, const std::vector<DLTensor>& args) {
+    const TVMOpParam& param, const std::vector<DLTensor*>& args) {
   std::shared_ptr<GraphExecutor::OpArgs> arg_ptr = std::make_shared<GraphExecutor::OpArgs>();
   // setup address.
   arg_ptr->args = args;
@@ -553,7 +568,7 @@ std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs> > GraphEx
   }
   for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
     TVMValue v;
-    DLTensor* t = &arg_ptr->args[i];
+    DLTensor* t = arg_ptr->args[i];
     v.v_handle = t;
     arg_ptr->arg_values.push_back(v);
     arg_ptr->arg_tcodes.push_back(kTVMDLTensorHandle);
diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h
index 0a7086c9f125..c93f35976cc0 100644
--- a/src/runtime/graph_executor/graph_executor.h
+++ b/src/runtime/graph_executor/graph_executor.h
@@ -66,7 +66,7 @@ struct TVMOpParam {
  */
 class TVM_DLL GraphExecutor : public ModuleNode {
   struct OpArgs {
-    std::vector<DLTensor> args;
+    std::vector<DLTensor*> args;
     std::vector<TVMValue> arg_values;
     std::vector<int> arg_tcodes;
     std::vector<int64_t> shape_data;
@@ -437,7 +437,7 @@ class TVM_DLL GraphExecutor : public ModuleNode {
    * \return The created executor.
    */
   std::pair<std::function<void()>, std::shared_ptr<OpArgs> > CreateTVMOp(
-      const TVMOpParam& attrs, const std::vector<DLTensor>& args);
+      const TVMOpParam& attrs, const std::vector<DLTensor*>& args);
   // Get node entry index.
   uint32_t entry_id(uint32_t nid, uint32_t index) const { return node_row_ptr_[nid] + index; }
   // Get node entry index.
@@ -460,6 +460,8 @@ class TVM_DLL GraphExecutor : public ModuleNode {
   std::vector<std::vector<DLTensor*>> output_dltensors_;
   /*! \brief Used for quick node(both model output and op input) DLTensor* lookup given an eid. */
   std::vector<std::vector<DLTensor*>> both_output_opinput_dltensors_;
+  /*! \brief Used for quick entry_id lookup given a storage_id. */
+  std::vector<std::vector<uint32_t>> sid_to_eid_;
   /*! \brief Used for quick entry indexing. */
   std::vector<uint32_t> node_row_ptr_;
   /*! \brief Output entries. */
diff --git a/tests/python/unittest/test_set_input_zero_copy.py b/tests/python/unittest/test_set_input_zero_copy.py
new file mode 100644
index 000000000000..3effbaed152f
--- /dev/null
+++ b/tests/python/unittest/test_set_input_zero_copy.py
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-function-docstring,missing-module-docstring
+import tvm
+from tvm import relay
+import numpy as np
+from tvm.contrib import graph_executor
+from tvm import testing
+import pytest
+
+
+dev = tvm.cpu(0)
+target = tvm.target.Target("llvm")
+
+
+def build_relay_module(func):
+    mod = tvm.IRModule()
+    mod["main"] = func
+    lib = relay.build(mod, target=target)
+
+    return graph_executor.GraphModule(lib["default"](dev))
+
+
+@testing.requires_llvm
+def test_simple_graph():
+    # Simple relay func:
+    # 1. y = x + 1
+    # 2. return y
+    shape = (2, 2)
+    x = relay.var("x", shape=shape, dtype="float32")
+    y = relay.add(x, relay.ones(shape, dtype="float32"))
+    func = relay.Function([x], y)
+
+    # Build two identical relay modules.
+    mod = build_relay_module(func)
+    mod_zero_copy = build_relay_module(func)
+    x_np = np.random.uniform(size=shape).astype(np.float32)
+
+    # Use set_input()
+    x_nd = tvm.nd.array(x_np, device=dev)
+    mod.set_input("x", x_nd)
+    mod.run()
+
+    # Use set_input_zero_copy()
+    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
+    index = mod_zero_copy.get_input_index("x")
+    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
+    mod_zero_copy.run()
+
+    # Expect the same output.
+    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
+
+
+@testing.requires_llvm
+def test_input_in_output():
+    # Relay func whose input also appears in the output:
+    # 1. y = x + 1
+    # 2. return [x, y]
+    shape = (3, 4)
+    x = relay.var("x", shape=shape, dtype="float32")
+    y = relay.add(x, relay.ones(shape, dtype="float32"))
+    func = relay.Function([x], relay.expr.Tuple([x, y]))
+
+    # Build two identical relay modules.
+    mod = build_relay_module(func)
+    mod_zero_copy = build_relay_module(func)
+
+    x_np = np.random.uniform(size=shape).astype(np.float32)
+
+    # Use set_input()
+    x_nd = tvm.nd.array(x_np, device=dev)
+    mod.set_input("x", x_nd)
+    mod.run()
+
+    # Use set_input_zero_copy()
+    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
+    index = mod_zero_copy.get_input_index("x")
+    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
+    mod_zero_copy.run()
+
+    # Expect the same output "x".
+    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
+
+
+@testing.requires_llvm
+def test_reshape_after_input():
+    # Relay func where a reshape op immediately follows the input:
+    # 1. y = reshape(x)
+    # 2. z = y + 1
+    # 3. return [x, y, z]
+    shape = (3, 4)
+    x = relay.var("x", shape=shape, dtype="float32")
+    y = relay.reshape(x, (1, 12))
+    z = relay.add(y, relay.ones((1, 12), dtype="float32"))
+    func = relay.Function([x], relay.expr.Tuple([x, y, z]))
+
+    # Build two identical relay modules.
+    mod = build_relay_module(func)
+    mod_zero_copy = build_relay_module(func)
+
+    x_np = np.random.uniform(size=shape).astype(np.float32)
+
+    # Use set_input()
+    x_nd = tvm.nd.array(x_np, device=dev)
+    mod.set_input("x", x_nd)
+    mod.run()
+
+    # Use set_input_zero_copy()
+    x_nd_zero_copy = tvm.nd.array(x_np, device=dev)
+    index = mod_zero_copy.get_input_index("x")
+    mod_zero_copy.module["set_input_zero_copy"](index, x_nd_zero_copy)
+    mod_zero_copy.run()
+
+    # Expect the same output "x".
+    testing.assert_allclose(mod.get_output(0).numpy(), mod_zero_copy.get_output(0).numpy())
+    # Expect the same output "y".
+    testing.assert_allclose(mod.get_output(1).numpy(), mod_zero_copy.get_output(1).numpy())
+
+
+if __name__ == "__main__":
+    test_simple_graph()
+    test_input_in_output()
+    test_reshape_after_input()