From bf903a20bb854b4e6830cefbb34243fe9f2a96cb Mon Sep 17 00:00:00 2001
From: Eric Lunderberg <elunderberg@octoml.ai>
Date: Thu, 2 Nov 2023 15:05:09 -0500
Subject: [PATCH] [Unity] Implement RemoveUnusedOutputs transform

This commit implements `relax.transform.RemoveUnusedOutputs`, an
extension of the current `DeadCodeElimination` pass.  If an internal
callee produces multiple outputs, but only some of those outputs are
used, the callee can be rewritten to only produce the outputs that are
used.  This is intended to allow more aggressive DCE in the callee, as
reducing the set of outputs can result in intermediate computations
becoming unused.
---
 include/tvm/relax/transform.h                 |   6 +
 python/tvm/relax/transform/__init__.py        |   1 +
 python/tvm/relax/transform/transform.py       |  10 +
 python/tvm/script/parser/relax/parser.py      |   3 +
 python/tvm/testing/utils.py                   |   5 +-
 src/relax/transform/remove_unused_outputs.cc  | 326 ++++++++++++++++++
 .../test_transform_remove_unused_outputs.py   | 123 +++++++
 7 files changed, 471 insertions(+), 3 deletions(-)
 create mode 100644 src/relax/transform/remove_unused_outputs.cc
 create mode 100644 tests/python/relax/test_transform_remove_unused_outputs.py

diff --git a/include/tvm/relax/transform.h b/include/tvm/relax/transform.h
index ae6d4059a633..b043765a6990 100644
--- a/include/tvm/relax/transform.h
+++ b/include/tvm/relax/transform.h
@@ -275,6 +275,12 @@ TVM_DLL Pass LiftTransformParams();
  */
 TVM_DLL Pass UpdateVDevice(VDevice new_vdevice, int64_t index);
 
+/*! \brief Remove unused outputs from internal functions
+ *
+ * \return The Pass
+ */
+TVM_DLL Pass RemoveUnusedOutputs();
+
 /*!
  * \brief Annotate Op Pattern Kind for TIR functions, which is used in FuseOps.
  * \note It is an auto-detect pass for "unscheduled prim_funcs", the op_pattern will be
diff --git a/python/tvm/relax/transform/__init__.py b/python/tvm/relax/transform/__init__.py
index 4d841a8e7b92..0ce0ebba1105 100644
--- a/python/tvm/relax/transform/__init__.py
+++ b/python/tvm/relax/transform/__init__.py
@@ -55,6 +55,7 @@
     PatternCheckContext,
     RealizeVDevice,
     RemovePurityChecking,
+    RemoveUnusedOutputs,
     RewriteCUDAGraph,
     RewriteDataflowReshape,
     RunCodegen,
diff --git a/python/tvm/relax/transform/transform.py b/python/tvm/relax/transform/transform.py
index 049bce7428da..428f8c24efd7 100644
--- a/python/tvm/relax/transform/transform.py
+++ b/python/tvm/relax/transform/transform.py
@@ -558,6 +558,16 @@ def FoldConstant() -> tvm.ir.transform.Pass:
     return _ffi_api.FoldConstant()  # type: ignore
 
 
+def RemoveUnusedOutputs() -> tvm.ir.transform.Pass:
+    """Remove unused outputs from internal functions
+
+    Returns
+    -------
+    ret: tvm.ir.transform.Pass
+    """
+    return _ffi_api.RemoveUnusedOutputs()  # type: ignore
+
+
 def AnnotateTIROpPattern() -> tvm.ir.transform.Pass:
     """Annotate Op Pattern Kind for TIR functions
 
diff --git a/python/tvm/script/parser/relax/parser.py b/python/tvm/script/parser/relax/parser.py
index 863c249975a7..1a0c3cea8e0b 100644
--- a/python/tvm/script/parser/relax/parser.py
+++ b/python/tvm/script/parser/relax/parser.py
@@ -367,6 +367,9 @@ def visit_if(self: Parser, node: doc.If) -> None:
 @dispatch.register(token="relax", type_name="enter_token")
 def enter_token(self: Parser) -> Dict[str, Any]:
     def relax_call(self, *args) -> Expr:
+
+        args = [convert_to_expr(arg) if isinstance(arg, tuple) else arg for arg in args]
+
         if all(isinstance(x, Expr) for x in args):
             return relax.Call(self, args)
         arg_types = [type(x) for x in args]
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 7c8b0e883c71..29c9463ba5c8 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -1938,7 +1938,7 @@ def __init_subclass__(cls):
 
     @classmethod
     def _normalize_ir_module(cls, func):
-        if isinstance(func, tvm.tir.PrimFunc):
+        if isinstance(func, (tvm.tir.PrimFunc, tvm.IRModule)):
 
             def inner(self):
                 # pylint: disable=unused-argument
@@ -2042,8 +2042,7 @@ def inner(self):
 
     @staticmethod
     def _is_method(func):
-        sig = inspect.signature(func)
-        return "self" in sig.parameters
+        return callable(func) and "self" in inspect.signature(func).parameters
 
     def test_compare(self, before, expected, transform):
         """Unit test to compare the expected TIR PrimFunc to actual"""
diff --git a/src/relax/transform/remove_unused_outputs.cc b/src/relax/transform/remove_unused_outputs.cc
new file mode 100644
index 000000000000..e3bf12382c67
--- /dev/null
+++ b/src/relax/transform/remove_unused_outputs.cc
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <tvm/relax/analysis.h>
+#include <tvm/relax/expr_functor.h>
+#include <tvm/relax/transform.h>
+#include <tvm/relax/utils.h>
+
+#include <algorithm>
+#include <optional>
+#include <tuple>
+
+namespace tvm {
+namespace relax {
+
+namespace {
+
+template <typename T>
+using PSet = std::unordered_set<T, ObjectPtrHash, ObjectPtrEqual>;
+
+template <typename T, typename U>
+using PMap = std::unordered_map<T, U, ObjectPtrHash, ObjectPtrEqual>;
+
+class PartialTupleUsageCollector : ExprVisitor {
+ public:
+  static PMap<GlobalVar, std::vector<bool>> Collect(const IRModule& mod) {
+    PMap<GlobalVar, size_t> num_outputs;
+
+    for (const auto& [gvar, base_func] : mod->functions) {
+      bool is_exposed = base_func->attrs.GetAttr<String>(tvm::attr::kGlobalSymbol).defined();
+
+      if (!is_exposed) {
+        if (auto relax_func = base_func.as<FunctionNode>()) {
+          if (auto out_tuple = relax_func->ret_struct_info.as<TupleStructInfoNode>()) {
+            num_outputs[gvar] = out_tuple->fields.size();
+          }
+        }
+      }
+    }
+
+    if (num_outputs.empty()) {
+      // Early bail-out if the module has no private functions that
+      // return tuples.
+      return {};
+    }
+
+    PartialTupleUsageCollector visitor(std::move(num_outputs));
+    for (const auto& [gvar, base_func] : mod->functions) {
+      if (auto func = base_func.as<Function>()) {
+        visitor.VisitExpr(func.value());
+      }
+    }
+
+    PMap<GlobalVar, std::vector<bool>> to_update;
+    for (const auto& [gvar, mask] : visitor.output_usage_mask_) {
+      bool has_unused_output =
+          std::any_of(mask.begin(), mask.end(), [](const bool is_used) { return !is_used; });
+      if (has_unused_output) {
+        to_update[gvar] = mask;
+      }
+    }
+
+    return to_update;
+  }
+
+ private:
+  explicit PartialTupleUsageCollector(PMap<GlobalVar, size_t> num_outputs) {
+    for (const auto& [gvar, num_output] : num_outputs) {
+      output_usage_mask_[gvar] = std::vector<bool>(num_output, false);
+    }
+  }
+
+  void VisitBinding(const Binding& binding) override {
+    ExprVisitor::VisitBinding(binding);
+    known_bindings_.Set(binding->var, GetBoundValue(binding));
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) override {
+    Expr tuple = UnwrapBindings(op->tuple);
+
+    if (auto call = tuple.as<CallNode>()) {
+      if (auto opt_callee = call->op.as<GlobalVar>()) {
+        auto callee = opt_callee.value();
+        if (auto it = output_usage_mask_.find(callee); it != output_usage_mask_.end()) {
+          auto& used_indices = it->second;
+
+          CHECK_GE(op->index, 0) << "IndexError: "
+                                 << "Indices for TupleGetItem must be non-negative, "
+                                 << "but expression " << GetRef<Expr>(op)
+                                 << " uses a tuple index of " << op->index;
+          size_t index = op->index;
+
+          CHECK_LT(index, used_indices.size())
+              << "IndexError: "
+              << "Indices for TupleGetItem must be less than the size of the tuple, "
+              << "but expression " << GetRef<Expr>(op) << " uses a tuple index of " << op->index
+              << " for a tuple of size " << used_indices.size();
+          used_indices[index] = true;
+        }
+      }
+    }
+  }
+
+  Expr UnwrapBindings(Expr expr) const {
+    auto get_bound_value = [&](const Expr& expr) -> Optional<Expr> {
+      if (auto var = expr.as<Var>()) {
+        if (auto known_binding = known_bindings_.Get(var.value())) {
+          return known_binding.value();
+        }
+      }
+      return NullOpt;
+    };
+
+    while (auto unwrapped = get_bound_value(expr)) {
+      expr = unwrapped.value();
+    }
+    return expr;
+  }
+
+  Map<Var, Expr> known_bindings_;
+  PMap<GlobalVar, std::vector<bool>> output_usage_mask_;
+};
+
+Function UpdateCallee(Function func, const std::vector<bool>& usage_mask) {
+  auto old_func_sinfo = func->struct_info_.as<FuncStructInfoNode>();
+
+  auto old_ret_sinfo = func->ret_struct_info.as<TupleStructInfoNode>();
+  ICHECK(old_ret_sinfo) << "All functions returning non-tuple outputs "
+                        << "should have been pruned already by PartialTupleUsageCollector";
+
+  Array<Expr> outputs;
+
+  // This helper variable will be removed by the post-proc of
+  // CanonicalizeBindings and DeadCodeElimination.
+  Var previous_outputs("previous_outputs", func->ret_struct_info);
+
+  for (size_t i = 0; i < usage_mask.size(); i++) {
+    if (usage_mask[i]) {
+      outputs.push_back(TupleGetItem(previous_outputs, i));
+    }
+  }
+
+  Expr new_output = outputs.size() == 1 ? outputs[0] : Tuple(outputs);
+  StructInfo new_return_sinfo =
+      outputs.size() == 1 ? GetStructInfo(outputs[0]) : TupleStructInfo(outputs.Map(GetStructInfo));
+
+  VarBinding binding(previous_outputs, func->body);
+  BindingBlock binding_block({binding});
+  SeqExpr new_body({binding_block}, new_output);
+
+  auto old_sinfo = Downcast<FuncStructInfo>(func->struct_info_);
+  FuncStructInfo new_sinfo(old_func_sinfo->params.value(), new_return_sinfo,
+                           old_func_sinfo->purity);
+
+  auto write_ptr = func.CopyOnWrite();
+  write_ptr->struct_info_ = new_sinfo;
+  write_ptr->body = new_body;
+
+  return func;
+}
+
+class CallSiteMutator : public ExprMutator {
+ public:
+  explicit CallSiteMutator(PMap<GlobalVar, std::function<Expr(Call)>> callsite_updaters)
+      : callsite_updaters_(callsite_updaters) {}
+
+  using ExprMutator::VisitExpr_;
+
+  Expr VisitExpr_(const CallNode* op) override {
+    auto node = Downcast<Call>(ExprMutator::VisitExpr_(op));
+
+    if (auto gvar = node->op.as<GlobalVar>()) {
+      if (auto it = callsite_updaters_.find(gvar.value()); it != callsite_updaters_.end()) {
+        return it->second(node);
+      }
+    }
+
+    return node;
+  }
+
+  PMap<GlobalVar, std::function<Expr(Call)>> callsite_updaters_;
+};
+
+}  // namespace
+
+namespace transform {
+
+Pass RemoveUnusedOutputs() {
+  runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> pass_func =
+      [=](IRModule mod, PassContext pc) -> IRModule {
+    auto usage = PartialTupleUsageCollector::Collect(mod);
+
+    if (usage.empty()) {
+      // Early bail-out if there are no updates to make.
+      return mod;
+    }
+
+    PMap<GlobalVar, std::function<Expr(Call)>> callsite_updaters;
+
+    {
+      IRModule new_callees;
+
+      for (const auto& [gvar, base_func] : mod->functions) {
+        if (auto func = base_func.as<Function>()) {
+          if (auto it = usage.find(gvar); it != usage.end()) {
+            const auto& usage_mask = it->second;
+            auto new_func = UpdateCallee(func.value(), usage_mask);
+
+            GlobalVar new_gvar(gvar->name_hint, new_func->checked_type_);
+            new_gvar->struct_info_ = new_func->struct_info_;
+            new_callees->Add(new_gvar, new_func);
+
+            callsite_updaters[gvar] = [old_gvar = gvar, new_gvar, usage_mask](Call call) -> Expr {
+              ICHECK(call->op.same_as(old_gvar)) << "InternalError: "
+                                                 << "Updater should be applied to " << old_gvar
+                                                 << ", but was applied to " << call->op;
+
+              auto old_call_sinfo = call->struct_info_.as<TupleStructInfoNode>();
+              ICHECK(old_call_sinfo)
+                  << "InternalError: "
+                  << "Updater should be applied to Call producing an output tuple, "
+                  << "but " << call << " has struct info " << call->struct_info_;
+              CHECK_EQ(usage_mask.size(), old_call_sinfo->fields.size())
+                  << "Function " << call->op << " produces " << usage_mask.size() << " outputs, "
+                  << "but " << call << " was used in a context expecting "
+                  << old_call_sinfo->fields.size() << " outputs.";
+
+              Call new_call(new_gvar, call->args);
+
+              int num_outputs_used = 0;
+              for (bool used : usage_mask) {
+                num_outputs_used += used;
+              }
+
+              Array<Expr> new_results;
+              int new_result_index = 0;
+              for (size_t i = 0; i < usage_mask.size(); i++) {
+                if (usage_mask[i]) {
+                  // This element of the old output tuple was used.  We replace
+                  // it either with access into the new output tuple, if callee
+                  // still produces multiple outputs, or with the output
+                  // itself, if the callee has been reduced to producing a
+                  // single output.
+                  auto replacement = [&]() -> Expr {
+                    if (num_outputs_used == 1) {
+                      return new_call;
+                    } else {
+                      return TupleGetItem(new_call, new_result_index);
+                    }
+                  }();
+                  new_results.push_back(replacement);
+                  new_result_index++;
+                } else {
+                  // This element of the tuple was unused in the old output,
+                  // and is no longer generated from the modified callee.  We
+                  // could remember the index mapping and re-index any access
+                  // into the old tuple, but it's simpler to just let
+                  // CanonicalizeBindings and DCE handle it.
+                  new_results.push_back(
+                      relax::PrimValue(FloatImm(DataType::Float(64), std::nan(""))));
+                }
+              }
+
+              return Tuple(new_results);
+            };
+          }
+        }
+      }
+
+      auto write_ptr = mod.CopyOnWrite();
+      for (const auto& [gvar, callee] : new_callees->functions) {
+        write_ptr->Remove(write_ptr->GetGlobalVar(gvar->name_hint));
+        write_ptr->Add(gvar, callee);
+      }
+    }
+
+    CallSiteMutator mutator(std::move(callsite_updaters));
+
+    IRModule caller_updates;
+
+    for (const auto& [gvar, base_func] : mod->functions) {
+      if (auto func = base_func.as<Function>()) {
+        auto mutated = Downcast<Function>(mutator.VisitExpr(func.value()));
+        if (!mutated.same_as(base_func)) {
+          caller_updates->Add(gvar, mutated);
+        }
+      }
+    }
+
+    if (caller_updates->functions.size()) {
+      mod.CopyOnWrite()->Update(caller_updates);
+    }
+    return mod;
+  };
+  auto inner_pass = CreateModulePass(pass_func, 0, "RemoveUnusedOutputsInner", {});
+  return tvm::transform::Sequential(
+      {
+          inner_pass,
+          CanonicalizeBindings(),
+          DeadCodeElimination({}),
+      },
+      "RemoveUnusedOutputs");
+}
+
+TVM_REGISTER_GLOBAL("relax.transform.RemoveUnusedOutputs").set_body_typed(RemoveUnusedOutputs);
+
+}  // namespace transform
+
+}  // namespace relax
+}  // namespace tvm
diff --git a/tests/python/relax/test_transform_remove_unused_outputs.py b/tests/python/relax/test_transform_remove_unused_outputs.py
new file mode 100644
index 000000000000..c0405ca58d00
--- /dev/null
+++ b/tests/python/relax/test_transform_remove_unused_outputs.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+import tvm.testing
+from tvm.script import ir as I, relax as R, tir as T
+
+
+class BaseCompare(tvm.testing.CompareBeforeAfter):
+    transform = tvm.relax.transform.RemoveUnusedOutputs()
+
+
+class TestSimple(BaseCompare):
+    @I.ir_module
+    class Before:
+        @R.function
+        def main():
+            args = Before.func()
+            return args[0]
+
+        @R.function(private=True)
+        def func() -> R.Tuple([R.Tensor, R.Tensor]):
+            A = R.zeros([16, 16], "int32")
+            B = R.ones([16, 16], "int32")
+            return (A, B)
+
+    @I.ir_module
+    class Expected:
+        @R.function
+        def main():
+            A = Expected.func()
+            return A
+
+        @R.function(private=True)
+        def func() -> R.Tensor([16, 16], "int32"):
+            A = R.zeros([16, 16], "int32")
+            return A
+
+
+class TestUseMultipleOutputs(BaseCompare):
+    @I.ir_module
+    class Before:
+        @R.function
+        def main():
+            args = Before.func()
+            return (args[0], args[2])
+
+        @R.function(private=True)
+        def func() -> R.Tuple([R.Tensor, R.Tensor, R.Tensor]):
+            A = R.zeros([16, 16], "int32")
+            B = R.ones([16, 16], "int32")
+            C = R.zeros([32, 32], "int32")
+            return (A, B, C)
+
+    @I.ir_module
+    class Expected:
+        @R.function
+        def main():
+            args = Expected.func()
+            return (args[0], args[1])
+
+        @R.function(private=True)
+        def func() -> R.Tuple([R.Tensor([16, 16], "int32"), R.Tensor([32, 32], "int32")]):
+            A = R.zeros([16, 16], "int32")
+            C = R.zeros([32, 32], "int32")
+            return (A, C)
+
+
+class TestMultipleCallSites(BaseCompare):
+    @I.ir_module
+    class Before:
+        @R.function
+        def main_a():
+            args = Before.func()
+            return args[0]
+
+        @R.function
+        def main_b():
+            args = Before.func()
+            return args[2]
+
+        @R.function(private=True)
+        def func() -> R.Tuple([R.Tensor, R.Tensor, R.Tensor]):
+            A = R.zeros([16, 16], "int32")
+            B = R.ones([16, 16], "int32")
+            C = R.zeros([32, 32], "int32")
+            return (A, B, C)
+
+    @I.ir_module
+    class Expected:
+        @R.function
+        def main_a():
+            args = Expected.func()
+            return args[0]
+
+        @R.function
+        def main_b():
+            args = Expected.func()
+            return args[1]
+
+        @R.function(private=True)
+        def func() -> R.Tuple([R.Tensor([16, 16], "int32"), R.Tensor([32, 32], "int32")]):
+            A = R.zeros([16, 16], "int32")
+            C = R.zeros([32, 32], "int32")
+            return (A, C)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()