Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ jobs:
git checkout origin/main
head_commit=$(git rev-parse HEAD)
git checkout $this_commit
git --no-pager diff --name-only $head_commit | grep -e "csrc/.*\.cpp" -e "csrc/.*\.h" | xargs lintrunner --take CLANGTIDY --force-color
# A lower-case letter in --diff-filter *excludes* that change type; 'd' drops
# deleted files so we never hand lintrunner a path that no longer exists:
# https://github.com/git/git/commit/7f2ea5f0f2fb056314092cce23202096ca70f076
git --no-pager diff --diff-filter=d --name-only $head_commit | grep -e "csrc/.*\.cpp" -e "csrc/.*\.h" | xargs lintrunner --take CLANGTIDY --force-color

lintrunner:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ endif()
# nvfuser codegen sources
set(NVFUSER_SRCS)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/assume.cpp
${NVFUSER_SRCS_DIR}/compute_at.cpp
${NVFUSER_SRCS_DIR}/inlining.cpp
${NVFUSER_SRCS_DIR}/compute_at_map.cpp
Expand Down Expand Up @@ -181,6 +180,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/utils.cpp
${NVFUSER_SRCS_DIR}/mma_type.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
${NVFUSER_SRCS_DIR}/optimization/add_axioms.cpp
${NVFUSER_SRCS_DIR}/optimization/consecutive_cast.cpp
${NVFUSER_SRCS_DIR}/optimization/pre_segmenter.cpp
)
Expand Down
3 changes: 3 additions & 0 deletions benchmark/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <ir/all_nodes.h>
#include <ir/utils.h>
#include <ops/all_ops.h>
#include <optimization/pre_segmenter.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/matmul.h>
#include <scheduler/matmul_heuristic.h>
Expand Down Expand Up @@ -141,6 +142,8 @@ static void SingleMatmulBase(
// Define fusion graph
setupMatmul(fusion, layout, params, turing_or_later);

optimization::OptimizationPass<optimization::PreSegmenter>::runPass(fusion);

// inputs
at::manual_seed(0);

Expand Down
48 changes: 0 additions & 48 deletions csrc/assume.cpp

This file was deleted.

18 changes: 0 additions & 18 deletions csrc/assume.h

This file was deleted.

24 changes: 21 additions & 3 deletions csrc/ir/container.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) {
to->val_type_name_map_ = from->val_type_name_map_;
to->expr_name_counter_ = from->expr_name_counter_;

if (from->axioms_ != nullptr) {
to->axioms_ = std::make_unique<std::vector<Bool*>>();
for (auto pred : *from->axioms_) {
to->axioms_->emplace_back(ir_cloner.clone(pred));
}
}

return ir_cloner;
}

Expand Down Expand Up @@ -189,7 +196,7 @@ void IrContainer::clear() noexcept {
exprs_.clear();
exprs_up_.clear();
raw_ptrs_.clear();

axioms_.reset();
val_type_name_map_.clear();
expr_name_counter_ = 0;
}
Expand Down Expand Up @@ -305,7 +312,7 @@ NamedScalar* IrContainer::magicZeroVal() {
return magic_zero_val_.get();
}

const std::vector<Bool*>& IrContainer::axioms() {
void IrContainer::lazyInitAxioms() {
if (!axioms_) {
axioms_ = std::make_unique<std::vector<Bool*>>();
axioms_->reserve(kParallelTypeThreads.size() * 3);
Expand All @@ -318,7 +325,18 @@ const std::vector<Bool*>& IrContainer::axioms() {
axioms_->push_back(SimplifyingIrBuilder::ltExpr(pidx, pdim));
}
}
return *axioms_;
}

void IrContainer::assumePositive(Val* val) {
  // An axiom only makes sense for a value owned by this container.
  TORCH_INTERNAL_ASSERT(val->container() == this);
  // Make sure the axiom list (and the default CUDA axioms) exists first.
  lazyInitAxioms();
  // Record the assumption: val > 0.
  axioms_->push_back(IrBuilder::gtExpr(val, zeroVal()));
}

void IrContainer::assumeNonNegative(Val* val) {
  // An axiom only makes sense for a value owned by this container.
  TORCH_INTERNAL_ASSERT(val->container() == this);
  // Make sure the axiom list (and the default CUDA axioms) exists first.
  lazyInitAxioms();
  // Record the assumption: val >= 0.
  axioms_->push_back(IrBuilder::geExpr(val, zeroVal()));
}

} // namespace nvfuser
10 changes: 9 additions & 1 deletion csrc/ir/container.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,13 @@ class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase {
Val* zeroVal(DataType dtype);
Val* oneVal(DataType dtype);
// Axioms about CUDA programming, for example: threadIdx.x < blockDim.x
const std::vector<Bool*>& axioms();
const std::vector<Bool*>& axioms() {
// Populate the default axiom set on first access (see lazyInitAxioms),
// so axioms_ is guaranteed non-null before dereferencing.
lazyInitAxioms();
return *axioms_;
}

void assumePositive(Val* val);
void assumeNonNegative(Val* val);

protected:
static IrCloner copy(const IrContainer* from, IrContainer* to);
Expand Down Expand Up @@ -131,6 +137,8 @@ class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase {

void clear() noexcept;

void lazyInitAxioms();

// Deque of unique pointer is the memory owning data structure
std::deque<std::unique_ptr<Val>> vals_up_;

Expand Down
5 changes: 5 additions & 0 deletions csrc/ir/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,11 @@ class ValReplacementMutator : private OptOutMutator {
more.emplace_back(v);
}
}
for (auto v : fusion->axioms()) {
if (std::find(stmts.begin(), stmts.end(), v) == stmts.end()) {
more.emplace_back(v);
}
}
auto more_stmts = StmtSort::getStmts(fusion, more, true, true);
more_stmts.insert(more_stmts.end(), stmts.begin(), stmts.end());

Expand Down
41 changes: 41 additions & 0 deletions csrc/optimization/add_axioms.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <optimization/add_axioms.h>

#include <unordered_set>
#include <vector>

#include <ir/utils.h>

namespace nvfuser::optimization {

// Registers `extent > 0` as an axiom of the fusion for every symbolic tensor
// extent found in the root domain (and, when present, the rfactor and
// allocation domains) of each TensorView used in math.
void AddAxiomsPass::runPass(Fusion* fusion) {
auto all_vals = fusion->usedMathVals();
// Tracks extents already assumed, so an extent shared by several
// domains/tensors is only registered once.
std::unordered_set<Val*> assumed_vals;
for (auto tv : ir_utils::filterByType<TensorView>(all_vals)) {
// Root domain is always inspected; rfactor/allocation only when the
// tensor actually has them.
std::vector<const std::vector<nvfuser::IterDomain*>*> interested_domains{
&tv->getRootDomain()};
if (tv->hasRFactor()) {
interested_domains.push_back(&tv->getRFactorDomain());
}
if (tv->hasAllocation()) {
interested_domains.push_back(&tv->getAllocationDomain());
}
for (auto dom : interested_domains) {
for (auto id : *dom) {
auto extent = id->extent();
// Only leaf, non-constant extents get an axiom: extents with a
// definition are derived from other values, and constant extents
// carry their value directly. insert(...).second dedupes.
if (extent->definition() == nullptr && !extent->isConstScalar() &&
assumed_vals.insert(extent).second) {
fusion->assumePositive(extent);
}
}
}
}
}

} // namespace nvfuser::optimization
20 changes: 20 additions & 0 deletions csrc/optimization/add_axioms.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// clang-format off
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
// clang-format on
// Missing include guard: without it, including this header twice in one
// translation unit redefines AddAxiomsPass and fails to compile.
#pragma once

#include <optimization/optimization_pass.h>

namespace nvfuser::optimization {

//! AddAxiomsPass registers `extent > 0` axioms on the fusion for the symbolic
//! (non-constant, definition-less) extents of every tensor's root domain, and
//! of its rfactor/allocation domains when present.
class TORCH_CUDA_CU_API AddAxiomsPass : public OptimizationPass<AddAxiomsPass> {
  friend class OptimizationPass<AddAxiomsPass>;

 protected:
  //! Entry point invoked via OptimizationPass<AddAxiomsPass>::runPass.
  static void runPass(Fusion* fusion);
};

} // namespace nvfuser::optimization
2 changes: 2 additions & 0 deletions csrc/optimization/pre_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
// clang-format on
#include <optimization/pre_segmenter.h>

#include <optimization/add_axioms.h>
#include <optimization/consecutive_cast.h>

namespace nvfuser::optimization {

void PreSegmenter::runPass(Fusion* fusion) {
// removes consecutive cast operations
OptimizationPass<ConsecutiveCastPass>::runPass(fusion);
// registers extent > 0 axioms for symbolic tensor extents
OptimizationPass<AddAxiomsPass>::runPass(fusion);
}

} // namespace nvfuser::optimization
12 changes: 1 addition & 11 deletions csrc/parallel_dimension_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include <parallel_dimension_map.h>

#include <ATen/cuda/CUDAContext.h>
#include <assume.h>
#include <device_lower/lower2device.h>
#include <disjoint_set.h>
#include <expr_simplifier.h>
Expand Down Expand Up @@ -70,16 +69,7 @@ void ParallelDimensionMap::build(Fusion* fusion) {

// Simplify dim_map_
for (auto& [k, v] : dim_map_) {
// Well, this isn't really correct, but we need this assumption to better
// handle non-empty cases. If this turn out to be an issue, I believe we
// then need to find a more systematic way to handle empty tensor, rather
// than just disable this assumption.
auto assume = assume::tensorsAreNotEmpty(v);
if (assume != nullptr) {
v = simplifyExpr(v, {}, {assume});
} else {
v = simplifyExpr(v);
}
v = simplifyExpr(v);
}

// Compute exact_types_
Expand Down
16 changes: 1 addition & 15 deletions test/test_expr_simplifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
// clang-format on
#include <gtest/gtest.h>

#include <assume.h>
#include <expr_simplifier.h>
#include <ops/all_ops.h>
#include <test/utils.h>
Expand Down Expand Up @@ -1011,23 +1010,10 @@ TEST_F(ExprSimplifierTest, MinMax_CUDA) {

auto expr =
"max( max( ceilDiv( T0.size[0] , 128 ) * 4 , ceilDiv( T0.size[0] , 128 ) ) , 4 )"_;
EXPECT_TRUE(simplify(expr, assume::tensorsAreNotEmpty(expr))
EXPECT_TRUE(simplify(expr, "T0.size[0] > 0"_b)
->sameAs("ceilDiv( T0.size[0] , 128 ) * 4"_));
}

TEST_F(ExprSimplifierTest, Assume_CUDA) {
auto expr =
"max( max( ceilDiv( T0.size[0] , 128 ) * 4 , ceilDiv( T0.size[1] , 128 ) ) , 4 )"_;
EXPECT_EQ(
simplifyExpr(IrBuilder::eqExpr(
assume::tensorsAreNotEmpty(expr),
"T0.size[0] > 0 && T0.size[1] > 0"_))
->getBool(),
true);
expr = "ceilDiv( T0.size[0] , T0.size[0] ) * T0.size[0]"_;
EXPECT_TRUE(assume::tensorsAreNotEmpty(expr)->sameAs("T0.size[0] > 0"_));
}

TEST_F(ExprSimplifierTest, PredicateDivToMul_CUDA) {
auto simplified = simplifyExpr("i1 / T0.size[0] < i2"_, {}, {"i1 >= 0"_b});
auto expect = "i1 < ( i2 * T0.size[0] )"_;
Expand Down
4 changes: 4 additions & 0 deletions test/test_gpu_tensorcore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <mma_type.h>
#include <mutator.h>
#include <ops/all_ops.h>
#include <optimization/pre_segmenter.h>
#include <root_domain_map.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/matmul.h>
Expand Down Expand Up @@ -936,6 +937,9 @@ TEST_F(NVFuserTest, FusionAmpereSwizzle_CUDA) {

fusion.addOutput(tv2);

optimization::OptimizationPass<optimization::PreSegmenter>::runPass(
&fusion);

MatMulTileOptions gemm_tile;
gemm_tile.cta_tile = GemmTile(128, 128, 32);
gemm_tile.warp_tile = GemmTile(64, 64, 32);
Expand Down