Skip to content
Merged
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ endif()
# nvfuser codegen sources
set(NVFUSER_SRCS)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/assume.cpp
${NVFUSER_SRCS_DIR}/compute_at.cpp
${NVFUSER_SRCS_DIR}/inlining.cpp
${NVFUSER_SRCS_DIR}/compute_at_map.cpp
Expand Down
47 changes: 47 additions & 0 deletions csrc/assume.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include <assume.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>

#include <unordered_set>
#include <vector>

namespace nvfuser::assume {

// Collect every tensor-size NamedScalar that `value` transitively depends on,
// then build the conjunction `size_0 > 0 && size_1 > 0 && ...`.
// Returns nullptr when `value` does not depend on any tensor size.
Bool* tensorsAreNotEmpty(Val* value) {
  // Iterative DFS over the definition graph of `value`.
  std::vector<Val*> todo{value};
  // Track already-expanded Vals (by pointer identity). Without this, a
  // definition DAG with shared subexpressions is re-traversed once per path,
  // which is exponential in the worst case and also floods `tensor_sizes`
  // with redundant entries.
  std::unordered_set<Val*> visited;
  std::vector<Val*> tensor_sizes;
  while (!todo.empty()) {
    auto v = todo.back();
    todo.pop_back();
    if (!visited.insert(v).second) {
      continue; // already expanded this Val via another path
    }
    if (auto ns = dynamic_cast<NamedScalar*>(v)) {
      if (ns->isTensorSize()) {
        tensor_sizes.emplace_back(v);
        continue;
      }
    }
    if (auto def = v->definition()) {
      for (auto inp : def->inputs()) {
        todo.emplace_back(inp);
      }
    }
  }
  Bool* result = nullptr;
  // tensor_sizes may still contain sameAs-equal duplicates that are distinct
  // objects, so deduplicate with sameAs before emitting predicates.
  std::vector<Val*> tensor_sizes_applied;
  for (auto ts : tensor_sizes) {
    bool is_duplicate = false;
    for (auto existing : tensor_sizes_applied) {
      if (existing->sameAs(ts)) {
        is_duplicate = true;
        break;
      }
    }
    if (!is_duplicate) {
      tensor_sizes_applied.emplace_back(ts);
      // Conjoin `ts > 0` onto the accumulated predicate. On the first
      // iteration `result` is nullptr; presumably andExpr(nullptr, x)
      // yields x — the original code relied on the same behavior.
      result = SimplifyingIrBuilder::andExpr(
          result, SimplifyingIrBuilder::gtExpr(ts, ts->container()->zeroVal()));
    }
  }
  return result;
}

} // namespace nvfuser::assume
18 changes: 18 additions & 0 deletions csrc/assume.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Missing include guard: without it, a second inclusion of this header in the
// same translation unit redeclares everything below.
#pragma once

#include <ir_all_nodes.h>

// Return boolean predicates representing the conditional you want to assume.
// The return value is typically used as the `assumptions` argument of
// `simplifyExpr`

namespace nvfuser::assume {

// Return a boolean predicate stating that all tensor sizes appearing in `value`
// are positive. Return nullptr if `value` does not depend on any tensor size.
// For example:
//   tensorsAreNotEmpty(ceilDiv(T0.size[0], 5) * T0.size[1])
//     -> T0.size[0] > 0 && T0.size[1] > 0
//   tensorsAreNotEmpty(ceilDiv(i1, 5) * i2)
//     -> nullptr
Bool* tensorsAreNotEmpty(Val* value);

} // namespace nvfuser::assume
34 changes: 34 additions & 0 deletions csrc/expr_simplifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1296,6 +1296,13 @@ bool isPositiveHelper(Val* value, const Context& context) {
}
return true;
}
} else if (auto bop = dynamic_cast<BinaryOp*>(value->definition())) {
auto op = bop->getBinaryOpType();
if (op == BinaryOpType::CeilDiv) {
return isPositive(bop->lhs(), context) &&
isValidDenominator(bop->rhs(), context) &&
isNonNegative(bop->rhs(), context);
}
}
for (const auto& [a, b] : context.getKnownLessThan()) {
if (a->isZero() && b->sameAs(value)) {
Expand Down Expand Up @@ -1611,6 +1618,33 @@ Val* eliminateTrivialComputation(Val* value, const Context& context) {
}
}
}
{ // max(a, b) -> a if a >= b, min(a, b) -> b if a >= b
if (op == BinaryOpType::Max || op == BinaryOpType::Min) {
std::vector<Val*> simplified_input;
for (auto v : fop->inputs()) {
bool found_redundant = false;
for (auto& v2 : simplified_input) {
if ((op == BinaryOpType::Max && prove::lessEqual(v, v2, context)) ||
(op == BinaryOpType::Min && prove::lessEqual(v2, v, context))) {
found_redundant = true;
break;
} else if (
(op == BinaryOpType::Max && prove::lessEqual(v2, v, context)) ||
(op == BinaryOpType::Min && prove::lessEqual(v, v2, context))) {
found_redundant = true;
v2 = v;
break;
}
}
if (!found_redundant) {
simplified_input.emplace_back(v);
}
}
if (simplified_input.size() < fop->inputs().size()) {
return maybeFlattenedOpOf(op, std::move(simplified_input));
}
}
}
} else if (auto bop = dynamic_cast<BinaryOp*>(value->definition())) {
auto lhs = foldConstants(bop->lhs());
auto rhs = foldConstants(bop->rhs());
Expand Down
12 changes: 11 additions & 1 deletion csrc/parallel_dimension_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <parallel_dimension_map.h>

#include <ATen/cuda/CUDAContext.h>
#include <assume.h>
#include <disjoint_set.h>
#include <expr_simplifier.h>
#include <ir_utils.h>
Expand Down Expand Up @@ -69,7 +70,16 @@ void ParallelDimensionMap::build(Fusion* fusion) {

// Simplify dim_map_
for (auto& [k, v] : dim_map_) {
v = simplifyExpr(v);
// Well, this isn't really correct, but we need this assumption to better
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But doesn't this actually break if empty tensors are given?

I know we haven't been very strict with empty tensors, so it's highly likely there are other things that would break with empty tensors, so this isn't a new issue.

Maybe we want to add a property to Kernel asserting there's no empty tensor? Similar to what we do with the index type, we could lower a fusion with an optional assertion (based on actual input tensor sizes) that it is safe to assume no tensor is empty. That seems to work good enough, doesn't it?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't know if anything really breaks today. If a tensor size is 0 and it appears in the parallel dim, this usually means we want to launch 0 blocks or 0 threads, which usually means all the fusion's outputs are empty tensors, and the lowering of this case is already skipped. The thing that I am really worried about is: if some extent is T0.size[0] + 1, then an empty input does not imply an empty output. I don't think we currently have such a schedule yet, but it might become a pitfall in the future.

Regarding the property for non-empty tensors in Kernel, I am not sure. But isn't that a big project to implement? For example, if you have such a property, you need to know whether to set it as true or false so that you can enable as many simplifications as possible. And doesn't that require modifying our cache system to handle 0 as a special number (just like how we treated 1 as broadcast), so that if I see an empty tensor, we make sure we don't reuse the cache entry built for non-empty tensors?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I didn't pay close attention that this is only for vals that represent parallel dimensions. Yes, in this case, I see no concern.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the property for non-empty tensor in Kernel, I am not sure. But isn't that a big project to implement it? For example, if you have such a property, you need to know whether to set it as true or false so that you can enable more simplifications as much as possible. And doesn't that requires modifying our cache system to handle 0 as a special number (just like how we treated 1 as broadcast), and if I see an empty tensor, we need to make sure we don't use the cache for non-empty tensor?

I don't think that's too different what we already do with the index type. Granted, we do have a couple of issues with it, but all we would need to do is to assume tensors can be empty by default and allow to have the non-empty tensor only with an optional flag that should be part of CompileOptions. Not necessary for this PR, but I think this approach should fit well by following the same approach as the index type specialization.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's leave this PR as is, and figure out our strategy when we actually see a bug with empty tensors. Matmul strongly relies on the assumption that tensors are not empty, but I don't see any other case where this is important. Still, I feel weird about the logic "if matmul, then assume non-empty, otherwise don't". Whether to assume non-emptiness should be based on the inputs, not on whether the fusion is a matmul.

// handle non-empty cases. If this turn out to be an issue, I believe we
// then need to find a more systematic way to handle empty tensor, rather
// than just disable this assumption.
auto assume = assume::tensorsAreNotEmpty(v);
if (assume != nullptr) {
v = simplifyExpr(v, {}, {assume});
} else {
v = simplifyExpr(v);
}
}

// Compute exact_types_
Expand Down
8 changes: 8 additions & 0 deletions csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <torch/csrc/jit/ir/ir.h>
#include <type.h>

#include <deque>
#include <sstream>
#include <string>
#include <type_traits>
Expand Down Expand Up @@ -374,6 +375,13 @@ std::string toDelimitedString(
return toDelimitedString(vec.begin(), vec.end(), delim);
}

// Join the elements of a deque into one string, separated by `delim`.
// Thin forwarding overload onto the iterator-pair implementation.
template <typename Printable>
std::string toDelimitedString(
    const std::deque<Printable>& dq,
    std::string delim = ", ") {
  return toDelimitedString(dq.cbegin(), dq.cend(), delim);
}

template <int64_t index, int64_t stop, int64_t step, typename func_t>
void unrolled_for(func_t fun) {
if constexpr (index < stop) {
Expand Down
Loading