Skip to content
Merged
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ endif()
# nvfuser codegen sources
set(NVFUSER_SRCS)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/assume.cpp
${NVFUSER_SRCS_DIR}/compute_at.cpp
${NVFUSER_SRCS_DIR}/inlining.cpp
${NVFUSER_SRCS_DIR}/compute_at_map.cpp
Expand Down
47 changes: 47 additions & 0 deletions csrc/assume.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include <assume.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>

#include <unordered_set>
#include <vector>

namespace nvfuser::assume {

// Collect every tensor-size NamedScalar that `value` transitively depends on,
// then build the conjunction `size_0 > 0 && size_1 > 0 && ...`.
// Returns nullptr when `value` does not depend on any tensor size.
Bool* tensorsAreNotEmpty(Val* value) {
  // Iterative DFS over the definition graph of `value`.
  std::vector<Val*> todo{value};
  // Track already-expanded Vals (by pointer identity). Without this, a
  // definition DAG with shared subexpressions is re-traversed once per path,
  // which is exponential in the worst case and also floods `tensor_sizes`
  // with redundant entries.
  std::unordered_set<Val*> visited;
  std::vector<Val*> tensor_sizes;
  while (!todo.empty()) {
    auto v = todo.back();
    todo.pop_back();
    if (!visited.insert(v).second) {
      continue; // already expanded this Val via another path
    }
    if (auto ns = dynamic_cast<NamedScalar*>(v)) {
      if (ns->isTensorSize()) {
        tensor_sizes.emplace_back(v);
        continue;
      }
    }
    if (auto def = v->definition()) {
      for (auto inp : def->inputs()) {
        todo.emplace_back(inp);
      }
    }
  }
  Bool* result = nullptr;
  // tensor_sizes may still contain sameAs-equal duplicates that are distinct
  // objects, so deduplicate with sameAs before emitting predicates.
  std::vector<Val*> tensor_sizes_applied;
  for (auto ts : tensor_sizes) {
    bool is_duplicate = false;
    for (auto existing : tensor_sizes_applied) {
      if (existing->sameAs(ts)) {
        is_duplicate = true;
        break;
      }
    }
    if (!is_duplicate) {
      tensor_sizes_applied.emplace_back(ts);
      // Conjoin `ts > 0` onto the accumulated predicate. On the first
      // iteration `result` is nullptr; presumably andExpr(nullptr, x)
      // yields x — the original code relied on the same behavior.
      result = SimplifyingIrBuilder::andExpr(
          result, SimplifyingIrBuilder::gtExpr(ts, ts->container()->zeroVal()));
    }
  }
  return result;
}

} // namespace nvfuser::assume
18 changes: 18 additions & 0 deletions csrc/assume.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Missing include guard: without it, a second inclusion of this header in the
// same translation unit redeclares everything below.
#pragma once

#include <ir_all_nodes.h>

// Return boolean predicates representing the conditional you want to assume.
// The return value is typically used as the `assumptions` argument of
// `simplifyExpr`

namespace nvfuser::assume {

// Return a boolean predicate stating that all tensor sizes appearing in `value`
// are positive. Return nullptr if `value` does not depend on any tensor size.
// For example:
//   tensorsAreNotEmpty(ceilDiv(T0.size[0], 5) * T0.size[1])
//     -> T0.size[0] > 0 && T0.size[1] > 0
//   tensorsAreNotEmpty(ceilDiv(i1, 5) * i2)
//     -> nullptr
Bool* tensorsAreNotEmpty(Val* value);

} // namespace nvfuser::assume
34 changes: 34 additions & 0 deletions csrc/expr_simplifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1296,6 +1296,13 @@ bool isPositiveHelper(Val* value, const Context& context) {
}
return true;
}
} else if (auto bop = dynamic_cast<BinaryOp*>(value->definition())) {
auto op = bop->getBinaryOpType();
if (op == BinaryOpType::CeilDiv) {
return isPositive(bop->lhs(), context) &&
isValidDenominator(bop->rhs(), context) &&
isNonNegative(bop->rhs(), context);
}
}
for (const auto& [a, b] : context.getKnownLessThan()) {
if (a->isZero() && b->sameAs(value)) {
Expand Down Expand Up @@ -1611,6 +1618,33 @@ Val* eliminateTrivialComputation(Val* value, const Context& context) {
}
}
}
{ // max(a, b) -> a if a >= b, min(a, b) -> b if a >= b
if (op == BinaryOpType::Max || op == BinaryOpType::Min) {
std::vector<Val*> simplified_input;
for (auto v : fop->inputs()) {
bool found_redundant = false;
for (auto& v2 : simplified_input) {
if ((op == BinaryOpType::Max && prove::lessEqual(v, v2, context)) ||
(op == BinaryOpType::Min && prove::lessEqual(v2, v, context))) {
found_redundant = true;
break;
} else if (
(op == BinaryOpType::Max && prove::lessEqual(v2, v, context)) ||
(op == BinaryOpType::Min && prove::lessEqual(v, v2, context))) {
found_redundant = true;
v2 = v;
break;
}
}
if (!found_redundant) {
simplified_input.emplace_back(v);
}
}
if (simplified_input.size() < fop->inputs().size()) {
return maybeFlattenedOpOf(op, std::move(simplified_input));
}
}
}
} else if (auto bop = dynamic_cast<BinaryOp*>(value->definition())) {
auto lhs = foldConstants(bop->lhs());
auto rhs = foldConstants(bop->rhs());
Expand Down
12 changes: 11 additions & 1 deletion csrc/parallel_dimension_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <parallel_dimension_map.h>

#include <ATen/cuda/CUDAContext.h>
#include <assume.h>
#include <disjoint_set.h>
#include <expr_simplifier.h>
#include <ir_utils.h>
Expand Down Expand Up @@ -69,7 +70,16 @@ void ParallelDimensionMap::build(Fusion* fusion) {

// Simplify dim_map_
for (auto& [k, v] : dim_map_) {
v = simplifyExpr(v);
// Well, this isn't really correct, but we need this assumption to better
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But doesn't this actually break if empty tensors are given?

I know we haven't been very strict with empty tensors, so it's highly likely there are other things that would break with empty tensors, so this isn't a new issue.

Maybe we want to add a property to Kernel asserting there's no empty tensor? Similar to what we do with the index type, we could lower a fusion with an optional assertion (based on actual input tensor sizes) that it is safe to assume no tensor is empty. That seems to work good enough, doesn't it?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't know if anything really breaks today. If a tensor size is 0 and it appears in the parallel dim, this usually means we want to launch 0 blocks or 0 threads, which usually means all the fusion's outputs are empty tensors, and the lowering of this case is already skipped. The thing that I am really worried about is: if some extent is T0.size[0] + 1, then an empty input does not imply an empty output. I don't think we currently have such a schedule yet, but it might become a pitfall in the future.

Regarding the property for non-empty tensors in Kernel, I am not sure. But isn't that a big project to implement? For example, if you have such a property, you need to know whether to set it as true or false so that you can enable as many simplifications as possible. And doesn't that require modifying our cache system to handle 0 as a special number (just like how we treated 1 as broadcast), so that if I see an empty tensor, we make sure we don't reuse the cache entry built for non-empty tensors?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I didn't pay close attention that this is only for vals that represent parallel dimensions. Yes, in this case, I see no concern.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the property for non-empty tensor in Kernel, I am not sure. But isn't that a big project to implement it? For example, if you have such a property, you need to know whether to set it as true or false so that you can enable more simplifications as much as possible. And doesn't that requires modifying our cache system to handle 0 as a special number (just like how we treated 1 as broadcast), and if I see an empty tensor, we need to make sure we don't use the cache for non-empty tensor?

I don't think that's too different what we already do with the index type. Granted, we do have a couple of issues with it, but all we would need to do is to assume tensors can be empty by default and allow to have the non-empty tensor only with an optional flag that should be part of CompileOptions. Not necessary for this PR, but I think this approach should fit well by following the same approach as the index type specialization.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's leave this PR as is, and figure out our strategy when we actually see a bug with empty tensors. Matmul strongly relies on the assumption that tensors are not empty, but I don't see any other case where this is important. Still, I feel weird about the logic "if matmul, then assume non-empty, otherwise don't". Whether to assume non-emptiness should be based on the inputs, not on whether the fusion is a matmul.

// handle non-empty cases. If this turn out to be an issue, I believe we
// then need to find a more systematic way to handle empty tensor, rather
// than just disable this assumption.
auto assume = assume::tensorsAreNotEmpty(v);
if (assume != nullptr) {
v = simplifyExpr(v, {}, {assume});
} else {
v = simplifyExpr(v);
}
}

// Compute exact_types_
Expand Down
8 changes: 8 additions & 0 deletions csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <torch/csrc/jit/ir/ir.h>
#include <type.h>

#include <deque>
#include <sstream>
#include <string>
#include <type_traits>
Expand Down Expand Up @@ -374,6 +375,13 @@ std::string toDelimitedString(
return toDelimitedString(vec.begin(), vec.end(), delim);
}

// Join the elements of a deque into one string, separated by `delim`.
// Thin forwarding overload onto the iterator-pair implementation.
template <typename Printable>
std::string toDelimitedString(
    const std::deque<Printable>& dq,
    std::string delim = ", ") {
  return toDelimitedString(dq.cbegin(), dq.cend(), delim);
}

template <int64_t index, int64_t stop, int64_t step, typename func_t>
void unrolled_for(func_t fun) {
if constexpr (index < stop) {
Expand Down
Loading