From faed8fbf4b13fa339b7e56b3d001d5adfe3b4efb Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 18:24:51 -0700 Subject: [PATCH 01/33] Initial kernel input support --- CMakeLists.txt | 2 + csrc/codegen.cpp | 9 ++- csrc/device_lower/lower2device.cpp | 12 +++- csrc/device_lower/pass/hoist_to_host.cpp | 54 +++++++++++++++ csrc/device_lower/pass/hoist_to_host.h | 20 ++++++ csrc/ir/base_nodes.cpp | 80 +++------------------ csrc/ir/utils.cpp | 20 ++++++ csrc/ir/utils.h | 5 ++ csrc/kernel.cpp | 12 ++++ csrc/kernel.h | 16 +++++ test/test_kernel_inputs.cpp | 88 ++++++++++++++++++++++++ 11 files changed, 245 insertions(+), 73 deletions(-) create mode 100644 csrc/device_lower/pass/hoist_to_host.cpp create mode 100644 csrc/device_lower/pass/hoist_to_host.h create mode 100644 test/test_kernel_inputs.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 25ca3f851e0..26faf1029d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/device_lower/pass/double_buffer.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/expr_sort.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/fusion_simplifier.cpp + ${NVFUSER_SRCS_DIR}/device_lower/pass/hoist_to_host.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/index.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/insert_syncs.cpp @@ -401,6 +402,7 @@ if(BUILD_TEST) ${NVFUSER_ROOT}/test/test_gpu_compute_with.cpp ${NVFUSER_ROOT}/test/test_expr_simplifier.cpp ${NVFUSER_ROOT}/test/test_external_src.cpp + ${NVFUSER_ROOT}/test/test_kernel_inputs.cpp ${NVFUSER_ROOT}/test/test_swizzle.cpp ${NVFUSER_ROOT}/test/test_tensor_factories.cpp ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index fd18a854b83..9ec6420172c 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -233,8 +233,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector params; // Inputs & Outputs - for (auto val : kernel_->inputs()) { + for (auto val : kernel_->getKernelInputs()) { params.push_back(val); + kernel_inputs_.insert(val); } for (auto val : kernel_->outputs()) { TORCH_INTERNAL_ASSERT( @@ -273,7 +274,6 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } } else { TORCH_INTERNAL_ASSERT(params[i]->isScalar()); // NOLINT (LLVM bug 48525) - TORCH_INTERNAL_ASSERT(params[i]->definition() == nullptr); code_ << params[i]->dtype() << " " << var_name_ss.str(); } @@ -479,7 +479,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } const auto def = s->definition(); const bool has_alloc = alloc_map_.find(s) != alloc_map_.end(); - if (def != nullptr && !has_alloc) { + const bool is_param = kernel_inputs_.find(s) != kernel_inputs_.end(); + if (def != nullptr && !has_alloc && !is_param) { code_ << "(" << genInline(def) << ")"; } else if (s->isConst()) { auto value = s->value(); @@ -2941,6 +2942,8 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector aligned_scope_exprs_; //! Keep track of the Val* and its generated variable name std::unordered_map val_to_name_; + //! Keep track of variables in the kernel inputs + std::unordered_set kernel_inputs_; }; } // namespace diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 43fb2bf1fad..b07be86cdf0 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -274,8 +275,13 @@ void GpuLower::lower(Fusion* fusion) { assignRNGOffset(fusion_); FusionGuard fg(fusion_); + kernel_->setKernelInputs(kernel_->inputs()); + dumpExprsIfEnabled(fusion_->exprs(), "initialize lowering"); + hoistScalarComputationToHost(kernel_.get()); + dumpExprsIfEnabled(fusion_->exprs(), "hoistScalarComputationToHost"); + // prepare for lowering validateIr(fusion_); dumpExprsIfEnabled(fusion_->exprs(), "validateIr"); @@ -400,9 +406,13 @@ void GpuLower::lower(Fusion* fusion) { const auto exprs_sorted = reorderExprsForComputeAt(); dumpExprsIfEnabled(exprs_sorted, "reorderExprsForComputeAt"); + // Remove expressions that are hoisted to host + const auto host_removed = removeExprsHoistedToHost(kernel_.get(), exprs_sorted); + dumpExprsIfEnabled(host_removed, "reorderExprsForComputeAt"); + // Generate loop-nests and place each expression at its // corresponding loop - const auto exprs_lowered = LoopNestGenerator::loweredExprs(exprs_sorted); + const auto exprs_lowered = LoopNestGenerator::loweredExprs(host_removed); dumpExprsIfEnabled(exprs_lowered, "LoopNestGenerator"); // Replace squeezes, Transpose, Shift, Gather, and View ops with diff --git a/csrc/device_lower/pass/hoist_to_host.cpp b/csrc/device_lower/pass/hoist_to_host.cpp new file mode 100644 index 00000000000..e4e37c93956 --- /dev/null +++ b/csrc/device_lower/pass/hoist_to_host.cpp @@ -0,0 +1,54 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#include + +namespace nvfuser { + +void hoistScalarComputationToHost(kir::Kernel* kernel) { + if (!kernel->hasManaged("hoist_to_host")) { + return; + } + for (auto v : kernel->getManaged>("hoist_to_host")) { + TORCH_INTERNAL_ASSERT( + !v->isA(), + "Hoisting tensor computation to host is not supported yet"); + kernel->addKernelInput(v); + } +} + +std::vector removeExprsHoistedToHost( + kir::Kernel* kernel, + const std::vector& exprs) { + std::unordered_set hoisted_vals( + kernel->getKernelInputs().begin(), kernel->getKernelInputs().end()); + std::vector new_exprs; + for (auto expr : exprs) { + bool all_outputs_hoisted = true; + bool any_outputs_hoisted = false; + for (auto out : expr->outputs()) { + if (hoisted_vals.count(out)) { + any_outputs_hoisted = true; + } else { + all_outputs_hoisted = false; + } + } + TORCH_INTERNAL_ASSERT( + all_outputs_hoisted == any_outputs_hoisted, + "Expression cannot have both hoisted and non-hoisted outputs"); + if (!all_outputs_hoisted) { + new_exprs.push_back(expr); + } + } + // TODO: this will leave some dead code in the kernel, but it is not a big + // deal for now. In a followup PR, we should write a dead code elimination + // pass to remove the dead code. + return new_exprs; +} + +} // namespace nvfuser diff --git a/csrc/device_lower/pass/hoist_to_host.h b/csrc/device_lower/pass/hoist_to_host.h new file mode 100644 index 00000000000..d2b3d3d27c1 --- /dev/null +++ b/csrc/device_lower/pass/hoist_to_host.h @@ -0,0 +1,20 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +namespace nvfuser { + +void hoistScalarComputationToHost(kir::Kernel* kernel); + +std::vector removeExprsHoistedToHost( + kir::Kernel* kernel, + const std::vector& exprs); + +} // namespace nvfuser diff --git a/csrc/ir/base_nodes.cpp b/csrc/ir/base_nodes.cpp index 81d92b5e9fa..775b924e564 100644 --- a/csrc/ir/base_nodes.cpp +++ b/csrc/ir/base_nodes.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -160,70 +161,6 @@ void Val::resolveIndexDtype() { dtype_ = index_dtype; } -namespace { - -// Traverse definition of all values involved in constructing the provided val. -// Check if all values involved are constant values, meaning the provided -// val is also a constant value. -class ConstCheck : private OptOutConstDispatch { - private: - bool is_const_ = true; - - // Returns true if all Val's in the hisotry of provided Val is an Int. Since - // our expression evaluator doesn't support any type besides int, it's - // important to check it is one. - bool is_int_ = true; - - void handle(const Scalar* b) final { - is_const_ = is_const_ && b->isConst(); - } - - void handle(const NamedScalar* ns) final { - is_const_ = false; - } - - void handle(const TensorView* ns) final { - is_const_ = false; - } - - void handle(const kir::TensorIndex* ns) final { - is_const_ = false; - } - - void handle(const Expr* expr) final { - for (auto inp : expr->inputs()) { - handle(inp); - } - } - - void handle(const Val* val) final { - if (!val->isIntegralScalar()) { - is_int_ = false; - } - - if (val->definition() != nullptr) { - handle(val->definition()); - } else { - OptOutConstDispatch::handle(val); - } - } - - public: - static bool isConst(const Val* val) { - ConstCheck cc; - cc.handle(val); - return cc.is_const_; - } - - static bool isConstInt(const Val* val) { - ConstCheck cc; - cc.handle(val); - return cc.is_const_ && cc.is_int_; - } -}; - -} // namespace - bool Val::sameAs(const Statement* other) const { if (this == other) { return true; @@ -262,16 +199,21 @@ bool Val::isConstScalar() const { if (!isScalar()) { return false; } - return ConstCheck::isConst(this); + // Unfortunately const model is broken. We can not easily cast a + // std::vector into a std::vector in C++. + return ir_utils::dependenciesSatisfied({const_cast(this)}, {}); } bool Val::isConstInt() const { - return ConstCheck::isConst(this) && isIntegralScalar(); + // Unfortunately const model is broken. We can not easily cast a + // std::vector into a std::vector in C++. + return ir_utils::dependenciesSatisfied({const_cast(this)}, {}) && + isIntegralScalar(); } int64_t Val::evaluateInt() { TORCH_INTERNAL_ASSERT( - ConstCheck::isConst(this), + ir_utils::dependenciesSatisfied({this}, {}), "Cannot get Int of not const values through IR nodes, must use runtime ExpressionEvaluator."); if (this->as()->value().hasValue()) { @@ -289,7 +231,7 @@ int64_t Val::evaluateInt() { double Val::evaluateDouble() { TORCH_INTERNAL_ASSERT( - ConstCheck::isConst(this), + ir_utils::dependenciesSatisfied({this}, {}), "Cannot get Double of not const doubles through IR nodes, must use runtime ExpressionEvaluator."); if (this->as()->value().hasValue()) { @@ -306,7 +248,7 @@ double Val::evaluateDouble() { bool Val::evaluateBool() { TORCH_INTERNAL_ASSERT( - ConstCheck::isConst(this), + ir_utils::dependenciesSatisfied({this}, {}), "Cannot get Bool of not const bools through IR nodes, must use runtime ExpressionEvaluator."); if (this->as()->value().hasValue()) { diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 8f3be4f934c..cb57a2a8b81 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1085,6 +1085,26 @@ void validateDomainEquivalence( ValidateDomainEquivalence(initial_domain, derived_domain); } +bool dependenciesSatisfied( + std::vector needed_vals, + std::unordered_set known_vals) { + while (!needed_vals.empty()) { + auto needed_val = needed_vals.back(); + needed_vals.pop_back(); + if (known_vals.count(needed_val) > 0 || needed_val->isConst()) { + continue; + } + auto def = needed_val->definition(); + if (def == nullptr) { + return false; + } + for (auto input : def->inputs()) { + needed_vals.emplace_back(input); + } + } + return true; +} + bool isAlignedScopeExpr(const Expr* expr) { TORCH_INTERNAL_ASSERT(expr != nullptr); if (auto ite = dynamic_cast(expr)) { diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 02397bcc6b6..3e66e01b255 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -427,6 +427,11 @@ void validateDomainEquivalence( const std::vector& initial_domain, const std::vector& derived_domain); +//! Check if all the inputs required to compute needed_val are known +bool dependenciesSatisfied( + std::vector needed_vals, + std::unordered_set known_vals); + //! Check if a conditional scope, i.e., ForLoop or IfThenElse, is //! guaranteed not to cause thread divergence bool isAlignedScopeExpr(const Expr* expr); diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 702763b01cb..e303cf789e3 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -383,6 +384,17 @@ void Kernel::registerExpr(Expr* expr) { Fusion::registerExpr(expr); } +void Kernel::setKernelInputs(std::vector kernel_inputs) { + ir_utils::dependenciesSatisfied( + kernel_inputs, {inputs().begin(), inputs().end()}); + kernel_inputs_ = std::move(kernel_inputs); +} + +void Kernel::addKernelInput(Val* input) { + ir_utils::dependenciesSatisfied({input}, {inputs().begin(), inputs().end()}); + kernel_inputs_.push_back(input); +} + std::vector& KernelInternalProxy::topLevelExprs() { return kernel_->top_level_exprs_; } diff --git a/csrc/kernel.h b/csrc/kernel.h index 2d894826c99..ad3ccd81878 100644 --- a/csrc/kernel.h +++ b/csrc/kernel.h @@ -228,6 +228,14 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion { //! Debug dump of the Kernel IR void print() const; + void setKernelInputs(std::vector kernel_inputs); + + void addKernelInput(Val* input); + + const std::vector& getKernelInputs() const { + return kernel_inputs_; + } + protected: using IrContainer::registerExpr; using IrContainer::registerVal; @@ -257,6 +265,14 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion { WarpPaddedParallelInfo warp_padded_parallel_info_; KernelPerformanceProfile profile_; + + // Inputs to the kernel, can be different from Fusion::inputs(). The + // relationship between kernel_inputs_ and Fusion::inputs() is similar to the + // relationship between root domain and rFactor domain. Fusion::inputs() are + // the inputs provided by the user, kernel_inputs_ are the inputs that will be + // sent to the kernel. Vals in kernel_inputs_ must be evaluatable from + // Fusion::inputs(). + std::vector kernel_inputs_; }; //! A special debugging proxy for Kernel. diff --git a/test/test_kernel_inputs.cpp b/test/test_kernel_inputs.cpp new file mode 100644 index 00000000000..4391b42de4f --- /dev/null +++ b/test/test_kernel_inputs.cpp @@ -0,0 +1,88 @@ +#include +#include + +#include +#include + +#include + +namespace nvfuser { + +class KernelInputsTest : public NVFuserTest {}; + +TEST_F(KernelInputsTest, HoistToHost1) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + auto scalar = IrBuilder::newScalar(DataType::Double); + fusion.addInput(scalar); + auto inv = div(fusion.oneVal(DataType::Double), scalar); + auto tv1 = mul(tv0, inv); + fusion.addOutput(tv1); + fusion.manage("hoist_to_host", std::vector{inv}); + + tv1->axis(0)->parallelize(ParallelType::TIDx); + + const std::string expected_kernel = R"( +__global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { + T1[((nvfuser_index_t)threadIdx.x)] + = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] + * (float) d1; +} +)"; + + assertCUDAKernel(&fusion, expected_kernel); + // TODO: executor change not implemented yet + return; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn({1000}, options); + FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input, 10.0}); + testValidate(&fusion, outputs, {input, 10.0}, __LINE__, __FILE__); +} + +TEST_F(KernelInputsTest, HoistToHost2) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + auto scalar = IrBuilder::newScalar(DataType::Double); + fusion.addInput(scalar); + auto inv = div(fusion.oneVal(DataType::Double), scalar); + auto inv_sqr = mul(inv, inv); + auto tv1 = mul(tv0, inv_sqr); + fusion.addOutput(tv1); + fusion.manage("hoist_to_host", std::vector{inv_sqr}); + + tv1->axis(0)->parallelize(ParallelType::TIDx); + + // TODO: d2 below is not used, but it is generated in the kernel + // write a dead code elimination pass to remove it + const std::string expected_kernel = R"( +__global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { + double d2; + d2 = 1.00000000000000000e+00 / d0; + T1[((nvfuser_index_t)threadIdx.x)] + = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] + * (float) d1; +} +)"; + + assertCUDAKernel(&fusion, expected_kernel); + // TODO: executor change not implemented yet + return; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn({1000}, options); + FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input, 10.0}); + testValidate(&fusion, outputs, {input, 10.0}, __LINE__, __FILE__); +} + +} // namespace nvfuser From ea58b4490e9327ce938c1b95c29607bf95df8c3a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 18:31:15 -0700 Subject: [PATCH 02/33] format --- csrc/device_lower/lower2device.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index b07be86cdf0..2d244ca1012 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -407,7 +407,8 @@ void GpuLower::lower(Fusion* fusion) { dumpExprsIfEnabled(exprs_sorted, "reorderExprsForComputeAt"); // Remove expressions that are hoisted to host - const auto host_removed = removeExprsHoistedToHost(kernel_.get(), exprs_sorted); + const auto host_removed = + removeExprsHoistedToHost(kernel_.get(), exprs_sorted); dumpExprsIfEnabled(host_removed, "reorderExprsForComputeAt"); // Generate loop-nests and place each expression at its From 06645a3cd23f4154b38b1c56a16bf442632ce81a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 19:40:05 -0700 Subject: [PATCH 03/33] cleanup --- csrc/device_lower/lower2device.cpp | 7 +----- csrc/device_lower/pass/expr_sort.cpp | 12 +++++++--- csrc/device_lower/pass/hoist_to_host.cpp | 29 ------------------------ test/test_kernel_inputs.cpp | 2 -- 4 files changed, 10 insertions(+), 40 deletions(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 2d244ca1012..1c502be0a08 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -406,14 +406,9 @@ void GpuLower::lower(Fusion* fusion) { const auto exprs_sorted = reorderExprsForComputeAt(); dumpExprsIfEnabled(exprs_sorted, "reorderExprsForComputeAt"); - // Remove expressions that are hoisted to host - const auto host_removed = - removeExprsHoistedToHost(kernel_.get(), exprs_sorted); - dumpExprsIfEnabled(host_removed, "reorderExprsForComputeAt"); - // Generate loop-nests and place each expression at its // corresponding loop - const auto exprs_lowered = LoopNestGenerator::loweredExprs(host_removed); + const auto exprs_lowered = LoopNestGenerator::loweredExprs(exprs_sorted); dumpExprsIfEnabled(exprs_lowered, "LoopNestGenerator"); // Replace squeezes, Transpose, Shift, Gather, and View ops with diff --git a/csrc/device_lower/pass/expr_sort.cpp b/csrc/device_lower/pass/expr_sort.cpp index 6acfba3b9fd..d55e89f4272 100644 --- a/csrc/device_lower/pass/expr_sort.cpp +++ b/csrc/device_lower/pass/expr_sort.cpp @@ -1331,7 +1331,10 @@ void ExprSegmentationSorter::sort() { // Need this for initialization of the DAG that is processed std::unordered_map expr2group; - auto all_exprs = fusion_->exprs(); + auto all_exprs = StmtSort::getExprsBetween( + fusion_, + fusion_->as()->getKernelInputs(), + fusion_->getTerminatingOutputs()); // Figure out all the values used as inputs to the expressions we're sorting // (to find terminating expressions). There could be branches of expressions @@ -1353,11 +1356,14 @@ void ExprSegmentationSorter::sort() { } // Create edges between the Exprs. Mark inputs and outputs of the fusion. - for (auto expr : fusion_->exprs()) { + for (auto expr : all_exprs) { auto expr_group = expr2group.at(expr); auto out = expr->outputs()[0]; for (auto inp : expr->inputs()) { - if (inp->isFusionInput()) { + if (std::any_of( + fusion_->as()->getKernelInputs().begin(), + fusion_->as()->getKernelInputs().end(), + [&inp](Val* input) { return input == inp; })) { continue; } diff --git a/csrc/device_lower/pass/hoist_to_host.cpp b/csrc/device_lower/pass/hoist_to_host.cpp index e4e37c93956..4baa7bf06d8 100644 --- a/csrc/device_lower/pass/hoist_to_host.cpp +++ b/csrc/device_lower/pass/hoist_to_host.cpp @@ -22,33 +22,4 @@ void hoistScalarComputationToHost(kir::Kernel* kernel) { } } -std::vector removeExprsHoistedToHost( - kir::Kernel* kernel, - const std::vector& exprs) { - std::unordered_set hoisted_vals( - kernel->getKernelInputs().begin(), kernel->getKernelInputs().end()); - std::vector new_exprs; - for (auto expr : exprs) { - bool all_outputs_hoisted = true; - bool any_outputs_hoisted = false; - for (auto out : expr->outputs()) { - if (hoisted_vals.count(out)) { - any_outputs_hoisted = true; - } else { - all_outputs_hoisted = false; - } - } - TORCH_INTERNAL_ASSERT( - all_outputs_hoisted == any_outputs_hoisted, - "Expression cannot have both hoisted and non-hoisted outputs"); - if (!all_outputs_hoisted) { - new_exprs.push_back(expr); - } - } - // TODO: this will leave some dead code in the kernel, but it is not a big - // deal for now. In a followup PR, we should write a dead code elimination - // pass to remove the dead code. - return new_exprs; -} - } // namespace nvfuser diff --git a/test/test_kernel_inputs.cpp b/test/test_kernel_inputs.cpp index 4391b42de4f..539531cf03b 100644 --- a/test/test_kernel_inputs.cpp +++ b/test/test_kernel_inputs.cpp @@ -65,8 +65,6 @@ TEST_F(KernelInputsTest, HoistToHost2) { // write a dead code elimination pass to remove it const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { - double d2; - d2 = 1.00000000000000000e+00 / d0; T1[((nvfuser_index_t)threadIdx.x)] = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] * (float) d1; From 48b2538a808e30503cc8aba4f2d84d019c8beea9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 20:13:37 -0700 Subject: [PATCH 04/33] cleanup --- csrc/codegen.cpp | 2 +- csrc/device_lower/lower2device.cpp | 3 +-- csrc/device_lower/lower2device.h | 12 ++++++++++++ csrc/device_lower/pass/expr_sort.cpp | 6 +++--- csrc/device_lower/pass/hoist_to_host.cpp | 21 ++++++++++++++------- csrc/device_lower/pass/hoist_to_host.h | 8 +++----- csrc/kernel.cpp | 13 +------------ csrc/kernel.h | 6 +----- 8 files changed, 36 insertions(+), 35 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 9ec6420172c..31f5c4cfee9 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -233,7 +233,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector params; // Inputs & Outputs - for (auto val : kernel_->getKernelInputs()) { + for (auto val : kernel_->kernelInputs()) { params.push_back(val); kernel_inputs_.insert(val); } diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 1c502be0a08..83741d77f84 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -275,11 +275,10 @@ void GpuLower::lower(Fusion* fusion) { assignRNGOffset(fusion_); FusionGuard fg(fusion_); - kernel_->setKernelInputs(kernel_->inputs()); dumpExprsIfEnabled(fusion_->exprs(), "initialize lowering"); - hoistScalarComputationToHost(kernel_.get()); + hoistScalarComputationToHost(fusion_, allKnownVals()); dumpExprsIfEnabled(fusion_->exprs(), "hoistScalarComputationToHost"); // prepare for lowering diff --git a/csrc/device_lower/lower2device.h b/csrc/device_lower/lower2device.h index 21b1cbffbc2..2a69947cc46 100644 --- a/csrc/device_lower/lower2device.h +++ b/csrc/device_lower/lower2device.h @@ -195,6 +195,14 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable { // in any pass that performs replacement. void propagateExprInfo(const Expr* old_expr, const Expr* new_expr); + std::vector& allKnownVals() { + return all_known_vals_; + } + + const std::vector& allKnownVals() const { + return all_known_vals_; + } + private: void lower(Fusion* fusion); @@ -240,6 +248,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable { // Info on each vectorized set op std::vector vectorized_set_info_; + // All vals that are known to the kernel, including fusion inputs and + // precomputed values + std::vector all_known_vals_; + Fusion* fusion_ = nullptr; }; diff --git a/csrc/device_lower/pass/expr_sort.cpp b/csrc/device_lower/pass/expr_sort.cpp index d55e89f4272..853e8af2f8b 100644 --- a/csrc/device_lower/pass/expr_sort.cpp +++ b/csrc/device_lower/pass/expr_sort.cpp @@ -1333,7 +1333,7 @@ void ExprSegmentationSorter::sort() { auto all_exprs = StmtSort::getExprsBetween( fusion_, - fusion_->as()->getKernelInputs(), + GpuLower::current()->allKnownVals(), fusion_->getTerminatingOutputs()); // Figure out all the values used as inputs to the expressions we're sorting @@ -1361,8 +1361,8 @@ void ExprSegmentationSorter::sort() { auto out = expr->outputs()[0]; for (auto inp : expr->inputs()) { if (std::any_of( - fusion_->as()->getKernelInputs().begin(), - fusion_->as()->getKernelInputs().end(), + GpuLower::current()->allKnownVals().begin(), + GpuLower::current()->allKnownVals().end(), [&inp](Val* input) { return input == inp; })) { continue; } diff --git a/csrc/device_lower/pass/hoist_to_host.cpp b/csrc/device_lower/pass/hoist_to_host.cpp index 4baa7bf06d8..33377941590 100644 --- a/csrc/device_lower/pass/hoist_to_host.cpp +++ b/csrc/device_lower/pass/hoist_to_host.cpp @@ -7,19 +7,26 @@ // clang-format on #include +#include namespace nvfuser { -void hoistScalarComputationToHost(kir::Kernel* kernel) { +void hoistScalarComputationToHost( + Fusion* kernel, + std::vector& all_known_vals) { + all_known_vals.insert( + all_known_vals.end(), kernel->inputs().begin(), kernel->inputs().end()); if (!kernel->hasManaged("hoist_to_host")) { return; } - for (auto v : kernel->getManaged>("hoist_to_host")) { - TORCH_INTERNAL_ASSERT( - !v->isA(), - "Hoisting tensor computation to host is not supported yet"); - kernel->addKernelInput(v); - } + const auto& hoist_to_host = + kernel->getManaged>("hoist_to_host"); + TORCH_CHECK( + ir_utils::dependenciesSatisfied( + hoist_to_host, {kernel->inputs().begin(), kernel->inputs().end()}), + "Cannot hoist to host because some inputs are not provided"); + all_known_vals.insert( + all_known_vals.end(), hoist_to_host.begin(), hoist_to_host.end()); } } // namespace nvfuser diff --git a/csrc/device_lower/pass/hoist_to_host.h b/csrc/device_lower/pass/hoist_to_host.h index d2b3d3d27c1..cf66f611402 100644 --- a/csrc/device_lower/pass/hoist_to_host.h +++ b/csrc/device_lower/pass/hoist_to_host.h @@ -11,10 +11,8 @@ namespace nvfuser { -void hoistScalarComputationToHost(kir::Kernel* kernel); - -std::vector removeExprsHoistedToHost( - kir::Kernel* kernel, - const std::vector& exprs); +void hoistScalarComputationToHost( + Fusion* kernel, + std::vector& all_known_vals); } // namespace nvfuser diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index e303cf789e3..585af8862db 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -315,6 +314,7 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); + kernel_inputs_ = GpuLower::current()->allKnownVals(); } void Kernel::analyze() { @@ -384,17 +384,6 @@ void Kernel::registerExpr(Expr* expr) { Fusion::registerExpr(expr); } -void Kernel::setKernelInputs(std::vector kernel_inputs) { - ir_utils::dependenciesSatisfied( - kernel_inputs, {inputs().begin(), inputs().end()}); - kernel_inputs_ = std::move(kernel_inputs); -} - -void Kernel::addKernelInput(Val* input) { - ir_utils::dependenciesSatisfied({input}, {inputs().begin(), inputs().end()}); - kernel_inputs_.push_back(input); -} - std::vector& KernelInternalProxy::topLevelExprs() { return kernel_->top_level_exprs_; } diff --git a/csrc/kernel.h b/csrc/kernel.h index ad3ccd81878..5d6a850b014 100644 --- a/csrc/kernel.h +++ b/csrc/kernel.h @@ -228,11 +228,7 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion { //! Debug dump of the Kernel IR void print() const; - void setKernelInputs(std::vector kernel_inputs); - - void addKernelInput(Val* input); - - const std::vector& getKernelInputs() const { + const std::vector& kernelInputs() const { return kernel_inputs_; } From 2080387bfd752125d6168f759f697c67181ecec3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 20:15:31 -0700 Subject: [PATCH 05/33] save --- test/test_kernel_inputs.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_kernel_inputs.cpp b/test/test_kernel_inputs.cpp index 539531cf03b..c1dbc2b04ea 100644 --- a/test/test_kernel_inputs.cpp +++ b/test/test_kernel_inputs.cpp @@ -61,8 +61,6 @@ TEST_F(KernelInputsTest, HoistToHost2) { tv1->axis(0)->parallelize(ParallelType::TIDx); - // TODO: d2 below is not used, but it is generated in the kernel - // write a dead code elimination pass to remove it const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { T1[((nvfuser_index_t)threadIdx.x)] From e09b45cf1ac33f85d721bff3f1831ab7f84132dc Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 18 Jul 2023 20:49:28 -0700 Subject: [PATCH 06/33] minimum set of inputs --- csrc/iter_visitor.cpp | 3 ++- csrc/kernel.cpp | 8 +++++++- test/test_kernel_inputs.cpp | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/csrc/iter_visitor.cpp b/csrc/iter_visitor.cpp index 932c876a214..dbef05ec3f6 100644 --- a/csrc/iter_visitor.cpp +++ b/csrc/iter_visitor.cpp @@ -301,7 +301,8 @@ class Inputs : public IterVisitor { Inputs(const std::vector& all_inputs) : all_inputs_(all_inputs) {} std::vector next(Val* v) override { - if (std::find(inputs_.begin(), inputs_.end(), v) != inputs_.end()) { + if (std::find(all_inputs_.begin(), all_inputs_.end(), v) != + all_inputs_.end()) { return {}; } return IterVisitor::next(v); diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 585af8862db..5e24102f01e 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -314,7 +314,13 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); - kernel_inputs_ = GpuLower::current()->allKnownVals(); + auto kernel_inputs = + IterVisitor::getInputsTo(outputs(), GpuLower::current()->allKnownVals()); + std::copy_if( + kernel_inputs.begin(), + kernel_inputs.end(), + std::back_inserter(kernel_inputs_), + [](Val* v) { return !v->isConst(); }); } void Kernel::analyze() { diff --git a/test/test_kernel_inputs.cpp b/test/test_kernel_inputs.cpp index c1dbc2b04ea..817a92ac4ea 100644 --- a/test/test_kernel_inputs.cpp +++ b/test/test_kernel_inputs.cpp @@ -26,10 +26,10 @@ TEST_F(KernelInputsTest, HoistToHost1) { tv1->axis(0)->parallelize(ParallelType::TIDx); const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { +__global__ void CUDAGeneratedKernel(Tensor T0, double d0, Tensor T1) { T1[((nvfuser_index_t)threadIdx.x)] = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] - * (float) d1; + * (float) d0; } )"; @@ -62,10 +62,10 @@ TEST_F(KernelInputsTest, HoistToHost2) { tv1->axis(0)->parallelize(ParallelType::TIDx); const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, double d0, double d1, Tensor T1) { +__global__ void CUDAGeneratedKernel(Tensor T0, double d0, Tensor T1) { T1[((nvfuser_index_t)threadIdx.x)] = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] - * (float) d1; + * (float) d0; } )"; From 8b9e980e34a8b4648337a11b1178ab178c709ef5 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 10:22:18 -0700 Subject: [PATCH 07/33] cleanup --- csrc/ir/utils.cpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 6940009fc2b..46590eb3864 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1090,26 +1090,6 @@ bool dependenciesSatisfied( return true; } -bool dependenciesSatisfied( - std::vector needed_vals, - std::unordered_set known_vals) { - while (!needed_vals.empty()) { - auto needed_val = needed_vals.back(); - needed_vals.pop_back(); - if (known_vals.count(needed_val) > 0 || needed_val->isConst()) { - continue; - } - auto def = needed_val->definition(); - if (def == nullptr) { - return false; - } - for (auto input : def->inputs()) { - needed_vals.emplace_back(input); - } - } - return true; -} - bool isAlignedScopeExpr(const Expr* expr) { TORCH_INTERNAL_ASSERT(expr != nullptr); if (auto ite = dynamic_cast(expr)) { From 72937d7b9df39c6238a9179f7e64b7787ee66373 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 13:06:36 -0700 Subject: [PATCH 08/33] save --- CMakeLists.txt | 2 - csrc/device_lower/lower2device.cpp | 15 ++++- csrc/device_lower/pass/hoist_to_host.cpp | 32 --------- csrc/device_lower/pass/hoist_to_host.h | 18 ----- test/test_kernel_inputs.cpp | 84 ------------------------ 5 files changed, 12 insertions(+), 139 deletions(-) delete mode 100644 csrc/device_lower/pass/hoist_to_host.cpp delete mode 100644 csrc/device_lower/pass/hoist_to_host.h delete mode 100644 test/test_kernel_inputs.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c55e87c3cc..ca3d7950c3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,6 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/device_lower/pass/double_buffer.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/expr_sort.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/fusion_simplifier.cpp - ${NVFUSER_SRCS_DIR}/device_lower/pass/hoist_to_host.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/index.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/scalar_hoist.cpp ${NVFUSER_SRCS_DIR}/device_lower/pass/insert_syncs.cpp @@ -403,7 +402,6 @@ if(BUILD_TEST) ${NVFUSER_ROOT}/test/test_gpu_compute_with.cpp ${NVFUSER_ROOT}/test/test_expr_simplifier.cpp ${NVFUSER_ROOT}/test/test_external_src.cpp - ${NVFUSER_ROOT}/test/test_kernel_inputs.cpp ${NVFUSER_ROOT}/test/test_swizzle.cpp ${NVFUSER_ROOT}/test/test_tensor_factories.cpp ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 83d83066306..befa0107488 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -239,6 +238,16 @@ void dumpExprsIfEnabled( } } +namespace { + +// A temporary function that copy inputs to kernel_inputs. In the future, this +// will be replaced a real pass that computes the kernel inputs. +void _setKernelInputs(kir::Kernel* kernel) { + allKnownVals() = kernel->inputs(); +} + +} // namespace + void GpuLower::lower(Fusion* fusion) { FUSER_PERF_SCOPE("GpuLower::lower"); TORCH_INTERNAL_ASSERT(fusion != nullptr); @@ -278,8 +287,8 @@ void GpuLower::lower(Fusion* fusion) { dumpExprsIfEnabled(fusion_->exprs(), "initialize lowering"); - hoistScalarComputationToHost(fusion_, allKnownVals()); - dumpExprsIfEnabled(fusion_->exprs(), "hoistScalarComputationToHost"); + _setKernelInputs(kernel_); + dumpExprsIfEnabled(fusion_->exprs(), "_setKernelInputs"); // prepare for lowering validateIr(fusion_); diff --git a/csrc/device_lower/pass/hoist_to_host.cpp b/csrc/device_lower/pass/hoist_to_host.cpp deleted file mode 100644 index 33377941590..00000000000 --- a/csrc/device_lower/pass/hoist_to_host.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on - -#include -#include - -namespace nvfuser { - -void hoistScalarComputationToHost( - Fusion* kernel, - std::vector& all_known_vals) { - all_known_vals.insert( - all_known_vals.end(), kernel->inputs().begin(), kernel->inputs().end()); - if (!kernel->hasManaged("hoist_to_host")) { - return; - } - const auto& hoist_to_host = - kernel->getManaged>("hoist_to_host"); - TORCH_CHECK( - ir_utils::dependenciesSatisfied( - hoist_to_host, {kernel->inputs().begin(), kernel->inputs().end()}), - "Cannot hoist to host because some inputs are not provided"); - all_known_vals.insert( - all_known_vals.end(), hoist_to_host.begin(), hoist_to_host.end()); -} - -} // namespace nvfuser diff --git a/csrc/device_lower/pass/hoist_to_host.h b/csrc/device_lower/pass/hoist_to_host.h deleted file mode 100644 index cf66f611402..00000000000 --- a/csrc/device_lower/pass/hoist_to_host.h +++ /dev/null @@ -1,18 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include - -namespace nvfuser { - -void hoistScalarComputationToHost( - Fusion* kernel, - std::vector& all_known_vals); - -} // namespace nvfuser diff --git a/test/test_kernel_inputs.cpp b/test/test_kernel_inputs.cpp deleted file mode 100644 index 817a92ac4ea..00000000000 --- a/test/test_kernel_inputs.cpp +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include - -#include -#include - -#include - -namespace nvfuser { - -class KernelInputsTest : public NVFuserTest {}; - -TEST_F(KernelInputsTest, HoistToHost1) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto scalar = IrBuilder::newScalar(DataType::Double); - fusion.addInput(scalar); - auto inv = div(fusion.oneVal(DataType::Double), scalar); - auto tv1 = mul(tv0, inv); - fusion.addOutput(tv1); - fusion.manage("hoist_to_host", std::vector{inv}); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, double d0, Tensor T1) { - T1[((nvfuser_index_t)threadIdx.x)] - = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] - * (float) d0; -} -)"; - - assertCUDAKernel(&fusion, expected_kernel); - // TODO: executor change not implemented yet - return; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input, 10.0}); - testValidate(&fusion, outputs, {input, 10.0}, __LINE__, __FILE__); -} - -TEST_F(KernelInputsTest, HoistToHost2) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(1); - fusion.addInput(tv0); - auto scalar = IrBuilder::newScalar(DataType::Double); - fusion.addInput(scalar); - auto inv = div(fusion.oneVal(DataType::Double), scalar); - auto inv_sqr = mul(inv, inv); - auto tv1 = mul(tv0, inv_sqr); - fusion.addOutput(tv1); - fusion.manage("hoist_to_host", std::vector{inv_sqr}); - - tv1->axis(0)->parallelize(ParallelType::TIDx); - - const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, double d0, Tensor T1) { - T1[((nvfuser_index_t)threadIdx.x)] - = T0[(T0.stride[0] * ((nvfuser_index_t)threadIdx.x))] - * (float) d0; -} -)"; - - assertCUDAKernel(&fusion, expected_kernel); - // TODO: executor change not implemented yet - return; - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input, 10.0}); - testValidate(&fusion, outputs, {input, 10.0}, __LINE__, __FILE__); -} - -} // namespace nvfuser From d0cb6d5388c2febe1364dd55fbb1366b9a950612 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 13:16:21 -0700 Subject: [PATCH 09/33] fix --- csrc/device_lower/lower2device.cpp | 14 +++----------- csrc/kernel.cpp | 8 +------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index befa0107488..34ad0640296 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -238,16 +238,6 @@ void dumpExprsIfEnabled( } } -namespace { - -// A temporary function that copy inputs to kernel_inputs. In the future, this -// will be replaced a real pass that computes the kernel inputs. -void _setKernelInputs(kir::Kernel* kernel) { - allKnownVals() = kernel->inputs(); -} - -} // namespace - void GpuLower::lower(Fusion* fusion) { FUSER_PERF_SCOPE("GpuLower::lower"); TORCH_INTERNAL_ASSERT(fusion != nullptr); @@ -287,7 +277,9 @@ void GpuLower::lower(Fusion* fusion) { dumpExprsIfEnabled(fusion_->exprs(), "initialize lowering"); - _setKernelInputs(kernel_); + // Temporarily set kernel_inputs to inputs. In the future, we will have a real + // pass to determine how to set kernel_inputs. + allKnownVals() = kernel_->inputs(); dumpExprsIfEnabled(fusion_->exprs(), "_setKernelInputs"); // prepare for lowering diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 67df4d316cc..3c313dea636 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -315,13 +315,7 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); - auto kernel_inputs = - IterVisitor::getInputsTo(outputs(), GpuLower::current()->allKnownVals()); - std::copy_if( - kernel_inputs.begin(), - kernel_inputs.end(), - std::back_inserter(kernel_inputs_), - [](Val* v) { return !v->isConst(); }); + kernel_inputs_ = GpuLower::current()->allKnownVals(); } void Kernel::analyze() { From 5d9220de153bbdadd9f3e378dcbfbb5a2a9fe7cc Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 13:17:49 -0700 Subject: [PATCH 10/33] save --- csrc/executor_utils.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 11fdc2281fb..6e39a36ee32 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -838,6 +838,9 @@ void bindInputForExprEvaluation( if (tensor_arg_abstract != nullptr) { expr_eval.bind(cg_tensor, tensor_arg_abstract->getTensor()); } + +#if 1 + // Legacy code. To be removed in the future auto root_domain = TensorDomain::noReductions(cg_tensor->getMaybeRFactorDomain()); @@ -902,6 +905,7 @@ void bindInputForExprEvaluation( } } } +#endif } else if (val->getValType().value() == ValType::Others) { if (val->getDataType().value() == DataType::Int) { TORCH_INTERNAL_ASSERT( From 6060fb9313c85d494fd9e6b1d4fb0458b02ae08a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 13:28:26 -0700 Subject: [PATCH 11/33] save --- csrc/executor_kernel_arg.cpp | 1 + csrc/executor_kernel_arg.h | 5 +++++ csrc/executor_utils.cpp | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index 33b00c3d014..200e7167d30 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -466,6 +466,7 @@ std::unique_ptr makeCpuScalarTensorArg(const at::Tensor& tensor) { auto ptr = std::make_unique>(); static_assert(sizeof(ptr->instance_) == size); std::memcpy(&(ptr->instance_), tensor.data_ptr(), size); + ptr->tensor_ = tensor; return ptr; } diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index 538d3ce3d5f..c85f0a3e0bf 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -363,7 +363,12 @@ struct TensorArg : public TensorArgAbstract { template struct CpuScalarTensorArg : public ArgAbstract { std::array instance_; + at::Tensor tensor_; DEF_HELPEE_FUNC(CpuScalarTensor, instance_) + + at::Tensor getTensor() const { + return tensor_; + } }; // TODO: This class needs some further clean up and refactor diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 6e39a36ee32..1871361d379 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -838,6 +838,27 @@ void bindInputForExprEvaluation( if (tensor_arg_abstract != nullptr) { expr_eval.bind(cg_tensor, tensor_arg_abstract->getTensor()); } + // TODO: clean this up + auto cpu_scalar_tensor1 = dynamic_cast*>(arg); + if (cpu_scalar_tensor1 != nullptr) { + expr_eval.bind(cg_tensor, cpu_scalar_tensor1->getTensor()); + } + auto cpu_scalar_tensor2 = dynamic_cast*>(arg); + if (cpu_scalar_tensor2 != nullptr) { + expr_eval.bind(cg_tensor, cpu_scalar_tensor2->getTensor()); + } + auto cpu_scalar_tensor4 = dynamic_cast*>(arg); + if (cpu_scalar_tensor4 != nullptr) { + expr_eval.bind(cg_tensor, cpu_scalar_tensor4->getTensor()); + } + auto cpu_scalar_tensor8 = dynamic_cast*>(arg); + if (cpu_scalar_tensor8 != nullptr) { + expr_eval.bind(cg_tensor, cpu_scalar_tensor8->getTensor()); + } + auto cpu_scalar_tensor16 = dynamic_cast*>(arg); + if (cpu_scalar_tensor16 != nullptr) { + expr_eval.bind(cg_tensor, cpu_scalar_tensor16->getTensor()); + } #if 1 // Legacy code. To be removed in the future From 1d6301dfaedbca77edd96b806e6a9354d9dd432b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 13:37:15 -0700 Subject: [PATCH 12/33] save --- csrc/executor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index 2c59d0468d5..a72173c8a01 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1616,9 +1616,10 @@ std::vector FusionExecutor::runFusion( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; + auto expr_eval = executor_utils::bindInputs(args, lowered_->kernel()); + // only allocate outputs when not given if (outputs.empty()) { - auto expr_eval = executor_utils::bindInputs(args, lowered_->kernel()); outputs = allocOutputs( kernel(), executor_entry->outputs, @@ -1700,9 +1701,8 @@ std::vector FusionExecutor::runFusion( if (execute_kernel_) { ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - auto ee = executor_utils::bindInputs(args, kernel()); auto arg_buffer = - args.getBuffer(kernel()->indexType(), getTvsForKernelArguments(), ee); + args.getBuffer(kernel()->indexType(), getTvsForKernelArguments(), expr_eval); if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { From b0ea7606635b61a96d57564a830f58b2c12ba913 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 21:09:07 -0700 Subject: [PATCH 13/33] outputs and global buffers as kernel inputs --- csrc/codegen.cpp | 15 ------- csrc/executor.cpp | 4 +- csrc/executor_kernel_arg.cpp | 85 ++++++++++++++++++++++++++++++++++++ csrc/executor_kernel_arg.h | 33 +++++++++++--- csrc/kernel.cpp | 5 +++ 5 files changed, 119 insertions(+), 23 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 9534b582eb4..a09b21469a7 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -237,11 +237,6 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { params.push_back(val); kernel_inputs_.insert(val); } - for (auto val : kernel_->outputs()) { - TORCH_INTERNAL_ASSERT( - !val->isScalar(), "No scalar output is allowed: ", val->toString()); - params.push_back(val); - } // Generate parameter declarations unsigned int duplicate_counter = 0; @@ -282,16 +277,6 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } } - // Global buffers - for (auto allocate : kernel_summary.global_allocations) { - TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); - const auto tv = allocate->buffer()->as(); - const auto& alloc_domain = - TensorDomain::noReductions(tv->getMaybeAllocationDomain()); - code_ << ", Tensor<" << tv->dtype() << ", " << alloc_domain.size() << ", " - << alloc_domain.size() << "> " << genVariableName(tv); - } - // Kernels generating random numbers take extra (seed, offset) arguments if (kernel_summary.max_rng_offsets >= 0) { code_ << ", at::PhiloxCudaState philox_args"; diff --git a/csrc/executor.cpp b/csrc/executor.cpp index a72173c8a01..aa0eca85ed6 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1701,8 +1701,8 @@ std::vector FusionExecutor::runFusion( if (execute_kernel_) { ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - auto arg_buffer = - args.getBuffer(kernel()->indexType(), getTvsForKernelArguments(), expr_eval); + auto arg_buffer = args.getBuffer( + kernel()->indexType(), getTvsForKernelArguments(), expr_eval); if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index 200e7167d30..b95ad39fcd8 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -648,4 +648,89 @@ void KernelArgumentHolder::pushTensorProxy( arguments_.push_back(getAbstractTensorArg(at::Tensor(meta_tensor))); } +std::vector getKernelArgument( + ExpressionEvaluator& ee, + Val* parameter, + PrimDataType index_type) { + PolymorphicValue pv = ee.evaluate(parameter); + if (auto tv = dynamic_cast(parameter)) { + auto tensor = pv.as(); + if (is_cpu_scalar(tensor)) { + return std::vector( + (std::byte*)tensor.data_ptr(), + (std::byte*)tensor.data_ptr() + tensor.element_size()); + } else { + auto resolved_arg = + getTensorArg(tensor, tv, ee, index_type); + return std::vector( + (std::byte*)resolved_arg->arg(), + (std::byte*)resolved_arg->arg() + resolved_arg->argSize()); + } + } else if (isIntegralType(parameter->dtype())) { + int64_t v = pv.as(); + if (parameter->dtype() == DataType::Int || + (index_type == PrimDataType::Int && + parameter->dtype() == DataType::Index)) { + return std::vector((std::byte*)&v, (std::byte*)&v + 8); + } else if ( + parameter->dtype() == DataType::Int32 || + (index_type == PrimDataType::Int32 && + parameter->dtype() == DataType::Index)) { + int32_t v32 = (int32_t)v; + return std::vector((std::byte*)&v32, (std::byte*)&v32 + 4); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Tried to run a generated kernel with ", + parameter->dtype(), + " type, however only int32 and int64 are supported."); + } + } else if (isFloatingPointType(parameter->dtype())) { + double v = pv.as(); + if (parameter->dtype() == DataType::Double) { + return std::vector( + (std::byte*)&v, (std::byte*)&v + sizeof(double)); + } else if (parameter->dtype() == DataType::Float) { + float v32 = (float)v; + return std::vector( + (std::byte*)&v32, (std::byte*)&v32 + sizeof(float)); + } else if (parameter->dtype() == DataType::Half) { + at::Half v16 = (at::Half)v; + return std::vector( + (std::byte*)&v16, (std::byte*)&v16 + sizeof(at::Half)); + } else if (parameter->dtype() == DataType::BFloat16) { + at::BFloat16 v16 = (at::BFloat16)v; + return std::vector( + (std::byte*)&v16, (std::byte*)&v16 + sizeof(at::BFloat16)); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Tried to run a generated kernel with ", + parameter->dtype(), + " type, however only float, double, half and bfloat16 are supported."); + } + } else if (isComplexType(parameter->dtype())) { + std::complex v = pv.as>(); + if (parameter->dtype() == DataType::ComplexDouble) { + return std::vector( + (std::byte*)&v, (std::byte*)&v + sizeof(std::complex)); + } else if (parameter->dtype() == DataType::ComplexFloat) { + std::complex v32 = (std::complex)v; + return std::vector( + (std::byte*)&v32, (std::byte*)&v32 + sizeof(std::complex)); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Tried to run a generated kernel with ", + parameter->dtype(), + " type, however only complex float and complex double are supported."); + } + } else { + TORCH_INTERNAL_ASSERT( + false, + "Tried to run a generated kernel with unsupported dtype ", + parameter->dtype()); + } +} + } // namespace nvfuser diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index c85f0a3e0bf..4cb3b4ac899 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -115,6 +115,7 @@ struct TensorArgCodegen<0, 0, nvfuser_index_t> { } }; +// TODO: remove this struct ArgAbstract { virtual ~ArgAbstract() = default; virtual const void* arg() const = 0; @@ -127,6 +128,7 @@ struct ArgAbstract { }; }; +// TODO: remove this #define DEF_HELPEE_FUNC(TARGET_TYPE, ARG_NAME) \ bool isType(ArgType type) const override { \ return ArgType::TARGET_TYPE == type; \ @@ -144,6 +146,8 @@ struct ArgAbstract { return std::make_unique(*this); \ } + +// TODO: remove this #define DEF_TOSTRING_FUNC \ std::string toString() const override { \ std::stringstream ss; \ @@ -151,12 +155,14 @@ struct ArgAbstract { return ss.str(); \ } +// TODO: remove this struct PhiloxCudaStateArg : public ArgAbstract { at::PhiloxCudaState val_; PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){}; DEF_HELPEE_FUNC(PhiloxCudaState, val_) }; +// TODO: remove this struct LongArg : public ArgAbstract { int64_t val_; explicit LongArg(int64_t _val) : val_(_val) {} @@ -164,6 +170,7 @@ struct LongArg : public ArgAbstract { DEF_TOSTRING_FUNC }; +// TODO: remove this struct DoubleArg : public ArgAbstract { double val_; explicit DoubleArg(double _val) : val_(_val) {} @@ -171,6 +178,7 @@ struct DoubleArg : public ArgAbstract { DEF_TOSTRING_FUNC }; +// TODO: remove this struct ComplexDoubleArg : public ArgAbstract { c10::complex val_; explicit ComplexDoubleArg(c10::complex _val) : val_(_val) {} @@ -178,6 +186,7 @@ struct ComplexDoubleArg : public ArgAbstract { DEF_TOSTRING_FUNC }; +// TODO: remove this struct BoolArg : public ArgAbstract { bool val_; explicit BoolArg(bool _val) : val_(_val) {} @@ -185,6 +194,7 @@ struct BoolArg : public ArgAbstract { DEF_TOSTRING_FUNC }; +// TODO: remove this struct TensorArgAbstract : ArgAbstract { at::Tensor tensor_; @@ -259,6 +269,10 @@ struct TensorArgAbstract : ArgAbstract { TORCH_INTERNAL_ASSERT(false, "Abstract tensor arg does not have arg"); } + virtual size_t argSize() const { + TORCH_INTERNAL_ASSERT(false, "Abstract tensor arg does not have arg"); + } + std::string toString() const override { std::stringstream ss; auto rank = getRank(); @@ -275,12 +289,14 @@ struct TensorArgAbstract : ArgAbstract { } }; +// TODO: move this to GetMetaData::evaluate std::vector> inferAndValidateAllocationSizesAndStrides( const at::Tensor& tensor, TensorView* tv, ExpressionEvaluator& ee); +// TODO: remove this template struct TensorArg : public TensorArgAbstract { TENSOR_TYPE instance_; @@ -332,6 +348,10 @@ struct TensorArg : public TensorArgAbstract { return &instance_; } + size_t argSize() const override { + return sizeof(TENSOR_TYPE); + } + bool isAbstract() const override { return false; } @@ -360,6 +380,7 @@ struct TensorArg : public TensorArgAbstract { } }; +// TODO: remove this template struct CpuScalarTensorArg : public ArgAbstract { std::array instance_; @@ -371,12 +392,7 @@ struct CpuScalarTensorArg : public ArgAbstract { } }; -// TODO: This class needs some further clean up and refactor -//! KernelArgumentHolder copies meta information from kernel inputs, including -//! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used -//! for both compilation as well as kernel execution. The important thing is to -//! strip ownership of tensor from KernelArgumentHolder, so that during async -//! compilation, we are not unnecessarily holding memory that is not needed. +// TODO: remove this class TORCH_CUDA_CU_API KernelArgumentHolder { public: //! create KernelArgumentHolder from c10 inputs. Note that we we not taking @@ -490,4 +506,9 @@ class TORCH_CUDA_CU_API KernelArgumentHolder { std::optional cache_id_ = std::nullopt; }; +std::vector getKernelArgument( + ExpressionEvaluator& ee, + Val* parameter, + PrimDataType index_type); + } // namespace nvfuser diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 3c313dea636..f40ea1c601c 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -316,6 +316,11 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); kernel_inputs_ = GpuLower::current()->allKnownVals(); + kernel_inputs_.insert( + kernel_inputs_.end(), outputs().begin(), outputs().end()); + for (auto alloc : summary_.global_allocations) { + kernel_inputs_.push_back(alloc->buffer()); + } } void Kernel::analyze() { From 7eedc7490eac206009c5a28c81fd72decba53e64 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 22:51:03 -0700 Subject: [PATCH 14/33] real migration --- csrc/executor.cpp | 52 +++++++++++++++++++++++++++++++----- csrc/executor_kernel_arg.cpp | 7 +++-- csrc/executor_kernel_arg.h | 3 ++- csrc/executor_utils.cpp | 19 +++++++++---- csrc/executor_utils.h | 8 ++++++ 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index aa0eca85ed6..1ace5ab75de 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1616,7 +1616,13 @@ std::vector FusionExecutor::runFusion( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; - auto expr_eval = executor_utils::bindInputs(args, lowered_->kernel()); + ExpressionEvaluator expr_eval; + const auto& inputs = kernel()->inputs(); + + for (const auto i : c10::irange(inputs.size())) { + executor_utils::bindInputForExprEvaluation( + inputs[i], args[i], true, expr_eval); + } // only allocate outputs when not given if (outputs.empty()) { @@ -1636,6 +1642,19 @@ std::vector FusionExecutor::runFusion( } args.push(outputs); + for (const auto i : c10::irange(outputs.size())) { + auto output = kernel()->outputs()[i]; + if (std::any_of( + kernel()->inputs().begin(), + kernel()->inputs().end(), + [&](const auto& in) { return in == output; })) { + // Skip trivially forwarded outputs because they are just placeholders + continue; + } + executor_utils::bindInputForExprEvaluation( + output, args[inputs.size() + i], true, expr_eval, false); + } + std::vector intermediates; at::Tensor profile_buffer; { @@ -1660,15 +1679,32 @@ std::vector FusionExecutor::runFusion( } args.push(intermediate_buffer); intermediates.push_back(intermediate_buffer); + executor_utils::bindInputForExprEvaluation( + kernel()->summary().global_allocations[i]->buffer(), + args[inputs.size() + outputs.size() + i], + true, + expr_eval, + false); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } } } + std::vector> arg_buffers; + arg_buffers.reserve(kernel()->kernelInputs().size()); + for (auto v : kernel()->kernelInputs()) { + arg_buffers.emplace_back( + getKernelArgument(expr_eval, v, kernel()->indexType())); + } + // push back RNG state if needed if (lowered_->kernel()->summary().max_rng_offsets >= 0) { - args.appendPhiloxRNGSeed(executor_entry->rand_offset); + auto philox_seed = getPhiloxRNGSeed(executor_entry->rand_offset); + arg_buffers.emplace_back( + (std::byte*)&philox_seed, + (std::byte*)&philox_seed + sizeof(philox_seed)); + args.push(philox_seed); } if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { @@ -1701,8 +1737,12 @@ std::vector FusionExecutor::runFusion( if (execute_kernel_) { ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - auto arg_buffer = args.getBuffer( - kernel()->indexType(), getTvsForKernelArguments(), expr_eval); + + std::vector arg_buffer_ptrs; + arg_buffer_ptrs.reserve(arg_buffers.size()); + for (auto& arg_buffer : arg_buffers) { + arg_buffer_ptrs.push_back(arg_buffer.data()); + } if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { @@ -1743,7 +1783,7 @@ std::vector FusionExecutor::runFusion( launch_params_.bdimz(), launch_params_.smem(), stream, - arg_buffer, + arg_buffer_ptrs.data(), nullptr)); } else { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchCooperativeKernel"); @@ -1757,7 +1797,7 @@ std::vector FusionExecutor::runFusion( launch_params_.bdimz(), launch_params_.smem(), stream, - arg_buffer)); + arg_buffer_ptrs.data())); } if (measure_kernel_time) { diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index b95ad39fcd8..d8872894fc5 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -601,7 +601,7 @@ void KernelArgumentHolder::swap(int i, const ArgAbstract* arg) { arguments_[i].swap(holder); } -void KernelArgumentHolder::appendPhiloxRNGSeed(uint64_t rand_offset) { +at::PhiloxCudaState getPhiloxRNGSeed(uint64_t rand_offset) { at::PhiloxCudaState philox_engine_inputs; auto gen = at::cuda::detail::getDefaultCUDAGenerator(); { @@ -611,7 +611,7 @@ void KernelArgumentHolder::appendPhiloxRNGSeed(uint64_t rand_offset) { at::check_generator(gen)->philox_cuda_state( rand_offset); } - push(philox_engine_inputs); + return philox_engine_inputs; } std::string KernelArgumentHolder::toString() const { @@ -660,8 +660,7 @@ std::vector getKernelArgument( (std::byte*)tensor.data_ptr(), (std::byte*)tensor.data_ptr() + tensor.element_size()); } else { - auto resolved_arg = - getTensorArg(tensor, tv, ee, index_type); + auto resolved_arg = getTensorArg(tensor, tv, ee, index_type); return std::vector( (std::byte*)resolved_arg->arg(), (std::byte*)resolved_arg->arg() + resolved_arg->argSize()); diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index 4cb3b4ac899..c619a960e38 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -146,7 +146,6 @@ struct ArgAbstract { return std::make_unique(*this); \ } - // TODO: remove this #define DEF_TOSTRING_FUNC \ std::string toString() const override { \ @@ -506,6 +505,8 @@ class TORCH_CUDA_CU_API KernelArgumentHolder { std::optional cache_id_ = std::nullopt; }; +at::PhiloxCudaState getPhiloxRNGSeed(uint64_t rand_offset); + std::vector getKernelArgument( ExpressionEvaluator& ee, Val* parameter, diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 1871361d379..69ab11dc5d8 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -825,13 +825,13 @@ void validateVectorizedTensors( validateVectorizedSplits(kernel, expr_eval); } -namespace { - void bindInputForExprEvaluation( Val* val, const ArgAbstract* arg, bool check_consistency, - ExpressionEvaluator& expr_eval) { + ExpressionEvaluator& expr_eval, + bool legacy) { + TORCH_INTERNAL_ASSERT(val != nullptr); if (val->getValType() == ValType::TensorView) { TensorView* cg_tensor = val->as(); auto tensor_arg_abstract = dynamic_cast(arg); @@ -861,6 +861,10 @@ void bindInputForExprEvaluation( } #if 1 + if (!legacy) { + return; + } + // Legacy code. To be removed in the future auto root_domain = TensorDomain::noReductions(cg_tensor->getMaybeRFactorDomain()); @@ -940,12 +944,17 @@ void bindInputForExprEvaluation( "fusion expected Scalar Double inputs, but found ", argTypeToString(arg->type())); expr_eval.bind(val, *static_cast(arg->arg())); + } else if (val->getDataType().value() == DataType::ComplexDouble) { + TORCH_INTERNAL_ASSERT( + arg->isType(ArgType::ComplexDouble), + "fusion expected Scalar ComplexDouble inputs, but found ", + argTypeToString(arg->type())); + expr_eval.bind( + val, *static_cast*>(arg->arg())); } } } -} // namespace - ExpressionEvaluator bindInputs( const KernelArgumentHolder& args, Fusion* kernel, diff --git a/csrc/executor_utils.h b/csrc/executor_utils.h index 95e970fc3ce..6eedabbd317 100644 --- a/csrc/executor_utils.h +++ b/csrc/executor_utils.h @@ -44,6 +44,14 @@ void validateKernelOutputs( const std::vector& outputs, const c10::Device& device); +void bindInputForExprEvaluation( + Val* val, + const ArgAbstract* arg, + bool check_consistency, + ExpressionEvaluator& expr_eval, + bool legacy = true); + +// TODO: remove this function //! Bind input values to runtime values TORCH_CUDA_CU_API ExpressionEvaluator bindInputs( const KernelArgumentHolder& args, From 5848305fb9af54dc0a8d5067faef11fc959e0bd3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 23:05:04 -0700 Subject: [PATCH 15/33] tidy --- csrc/executor_kernel_arg.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index d8872894fc5..e7289669cba 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -652,6 +652,7 @@ std::vector getKernelArgument( ExpressionEvaluator& ee, Val* parameter, PrimDataType index_type) { + TORCH_INTERNAL_ASSERT(parameter != nullptr); PolymorphicValue pv = ee.evaluate(parameter); if (auto tv = dynamic_cast(parameter)) { auto tensor = pv.as(); From 14d50b77804b3aa927ec04ed6ffa599aaa08efac Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 23:18:13 -0700 Subject: [PATCH 16/33] renamings --- csrc/codegen.cpp | 8 ++++---- csrc/device_lower/lower2device.cpp | 6 +++--- csrc/executor.cpp | 4 ++-- csrc/executor_kernel_arg.cpp | 4 ++-- csrc/kernel.cpp | 8 ++++---- csrc/kernel.h | 17 ++++++++--------- 6 files changed, 23 insertions(+), 24 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index a09b21469a7..75dd92cc267 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -233,9 +233,9 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector params; // Inputs & Outputs - for (auto val : kernel_->kernelInputs()) { + for (auto val : kernel_->parameters()) { params.push_back(val); - kernel_inputs_.insert(val); + kernel_params_.insert(val); } // Generate parameter declarations @@ -464,7 +464,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } const auto def = s->definition(); const bool has_alloc = alloc_map_.find(s) != alloc_map_.end(); - const bool is_param = kernel_inputs_.find(s) != kernel_inputs_.end(); + const bool is_param = kernel_params_.find(s) != kernel_params_.end(); if (def != nullptr && !has_alloc && !is_param) { code_ << "(" << genInline(def) << ")"; } else if (s->isConst()) { @@ -2955,7 +2955,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { //! Keep track of the Val* and its generated variable name std::unordered_map val_to_name_; //! Keep track of variables in the kernel inputs - std::unordered_set kernel_inputs_; + std::unordered_set kernel_params_; }; } // namespace diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index 34ad0640296..a890f4d987a 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -277,10 +277,10 @@ void GpuLower::lower(Fusion* fusion) { dumpExprsIfEnabled(fusion_->exprs(), "initialize lowering"); - // Temporarily set kernel_inputs to inputs. In the future, we will have a real - // pass to determine how to set kernel_inputs. + // Temporarily set allKnownVals to inputs. In the future, we will have a real + // pass to determine how to set allKnownVals. allKnownVals() = kernel_->inputs(); - dumpExprsIfEnabled(fusion_->exprs(), "_setKernelInputs"); + dumpExprsIfEnabled(fusion_->exprs(), "set allKnownVals"); // prepare for lowering validateIr(fusion_); diff --git a/csrc/executor.cpp b/csrc/executor.cpp index 1ace5ab75de..32faddb1a72 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1692,8 +1692,8 @@ std::vector FusionExecutor::runFusion( } std::vector> arg_buffers; - arg_buffers.reserve(kernel()->kernelInputs().size()); - for (auto v : kernel()->kernelInputs()) { + arg_buffers.reserve(kernel()->parameters().size()); + for (auto v : kernel()->parameters()) { arg_buffers.emplace_back( getKernelArgument(expr_eval, v, kernel()->indexType())); } diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index e7289669cba..3c5b37dc2c3 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -695,11 +695,11 @@ std::vector getKernelArgument( return std::vector( (std::byte*)&v32, (std::byte*)&v32 + sizeof(float)); } else if (parameter->dtype() == DataType::Half) { - at::Half v16 = (at::Half)v; + at::Half v16 = (at::Half)(float)v; return std::vector( (std::byte*)&v16, (std::byte*)&v16 + sizeof(at::Half)); } else if (parameter->dtype() == DataType::BFloat16) { - at::BFloat16 v16 = (at::BFloat16)v; + at::BFloat16 v16 = (at::BFloat16)(float)v; return std::vector( (std::byte*)&v16, (std::byte*)&v16 + sizeof(at::BFloat16)); } else { diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index f40ea1c601c..54900fa6f44 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -315,11 +315,11 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); - kernel_inputs_ = GpuLower::current()->allKnownVals(); - kernel_inputs_.insert( - kernel_inputs_.end(), outputs().begin(), outputs().end()); + parameters_ = GpuLower::current()->allKnownVals(); + parameters_.insert( + parameters_.end(), outputs().begin(), outputs().end()); for (auto alloc : summary_.global_allocations) { - kernel_inputs_.push_back(alloc->buffer()); + parameters_.push_back(alloc->buffer()); } } diff --git a/csrc/kernel.h b/csrc/kernel.h index 5d6a850b014..823c9bec148 100644 --- a/csrc/kernel.h +++ b/csrc/kernel.h @@ -228,8 +228,8 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion { //! Debug dump of the Kernel IR void print() const; - const std::vector& kernelInputs() const { - return kernel_inputs_; + const std::vector& parameters() const { + return parameters_; } protected: @@ -262,13 +262,12 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion { KernelPerformanceProfile profile_; - // Inputs to the kernel, can be different from Fusion::inputs(). The - // relationship between kernel_inputs_ and Fusion::inputs() is similar to the - // relationship between root domain and rFactor domain. Fusion::inputs() are - // the inputs provided by the user, kernel_inputs_ are the inputs that will be - // sent to the kernel. Vals in kernel_inputs_ must be evaluatable from - // Fusion::inputs(). - std::vector kernel_inputs_; + // Parameters of the kernel. The parameters contain the inputs and outputs of + // the kernel, intermediate buffers, and special items such as RNG state and + // tensor map for TMA support, etc. The parameters are not required to have no + // definition. If a parameter has a definition, its definition will be + // evaluated before the kernel is executed. + std::vector parameters_; }; //! A special debugging proxy for Kernel. From 86cde8de243dcb72548beb3d8f9f577df432ce62 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 23:34:10 -0700 Subject: [PATCH 17/33] cleanups --- csrc/codegen.cpp | 36 +++++++++++++++++------------------- csrc/kernel.cpp | 3 +-- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 75dd92cc267..88e2305cab3 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -232,35 +232,33 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector params; - // Inputs & Outputs - for (auto val : kernel_->parameters()) { - params.push_back(val); - kernel_params_.insert(val); - } - // Generate parameter declarations + kernel_params_.reserve(kernel_->parameters().size()); unsigned int duplicate_counter = 0; - for (auto i : c10::irange(params.size())) { + for (auto i : c10::irange(kernel_->parameters().size())) { std::stringstream var_name_ss; - if (params[i]->isA()) { - var_name_ss << genVariableName(params[i]->as()); + auto param = kernel_->parameters().at(i); + kernel_params_.insert(param); + + if (param->isA()) { + var_name_ss << genVariableName(param->as()); } else { - var_name_ss << gen(params[i]); + var_name_ss << gen(param); } // If value is duplicate in arguments change the name to avoid name // conflicts in args. - if (!unique_args.emplace(params[i]).second) { + if (!unique_args.emplace(param).second) { var_name_ss << "_duplicate_" << duplicate_counter++; } - if (const auto tv = dynamic_cast(params[i])) { + if (const auto tv = dynamic_cast(param)) { if (tv->isCpuScalar()) { - code_ << " CpuScalarTensor<" << params[i]->dtype() << "> " + code_ << " CpuScalarTensor<" << param->dtype() << "> " << var_name_ss.str(); } else { code_ - << "Tensor<" << params[i]->dtype() << ", " + << "Tensor<" << param->dtype() << ", " << TensorDomain::noReductions(tv->getMaybeRFactorDomain()).size() << ", " << TensorDomain::noReductions(tv->getMaybeAllocationDomain()) @@ -268,16 +266,16 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { << "> " << var_name_ss.str(); } } else { - TORCH_INTERNAL_ASSERT(params[i]->isScalar()); // NOLINT (LLVM bug 48525) - code_ << params[i]->dtype() << " " << var_name_ss.str(); + TORCH_INTERNAL_ASSERT(param->isScalar()); // NOLINT (LLVM bug 48525) + code_ << param->dtype() << " " << var_name_ss.str(); } - if (i + 1 != params.size()) { + if (i + 1 != kernel_->parameters().size()) { code_ << ", "; } } - // Kernels generating random numbers take extra (seed, offset) arguments + // TODO: remove this special handling of philox state if (kernel_summary.max_rng_offsets >= 0) { code_ << ", at::PhiloxCudaState philox_args"; } @@ -2954,7 +2952,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::vector aligned_scope_exprs_; //! Keep track of the Val* and its generated variable name std::unordered_map val_to_name_; - //! Keep track of variables in the kernel inputs + //! basically kernel_->parameters(), but as a set so it's faster to lookup std::unordered_set kernel_params_; }; diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 54900fa6f44..3ff9e3ff86e 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -316,8 +316,7 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); parameters_ = GpuLower::current()->allKnownVals(); - parameters_.insert( - parameters_.end(), outputs().begin(), outputs().end()); + parameters_.insert(parameters_.end(), outputs().begin(), outputs().end()); for (auto alloc : summary_.global_allocations) { parameters_.push_back(alloc->buffer()); } From 38d61792b274eef1f9bb71f55b61931408db57b5 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 23:41:17 -0700 Subject: [PATCH 18/33] commenting --- csrc/device_lower/pass/expr_sort.cpp | 2 ++ csrc/executor_utils.h | 1 + 2 files changed, 3 insertions(+) diff --git a/csrc/device_lower/pass/expr_sort.cpp b/csrc/device_lower/pass/expr_sort.cpp index 81251e4a8fc..50983e2c441 100644 --- a/csrc/device_lower/pass/expr_sort.cpp +++ b/csrc/device_lower/pass/expr_sort.cpp @@ -1504,6 +1504,8 @@ void ExprSegmentationSorter::sort() { // Need this for initialization of the DAG that is processed std::unordered_map expr2group; + // Not putting the exprs between allKnownVals() and fusion inputs here + // because they are computed using the expr evaluator. auto all_exprs = StmtSort::getExprsBetween( fusion_, GpuLower::current()->allKnownVals(), diff --git a/csrc/executor_utils.h b/csrc/executor_utils.h index 6eedabbd317..2b856cb3f11 100644 --- a/csrc/executor_utils.h +++ b/csrc/executor_utils.h @@ -44,6 +44,7 @@ void validateKernelOutputs( const std::vector& outputs, const c10::Device& device); +// TODO: rename this function void bindInputForExprEvaluation( Val* val, const ArgAbstract* arg, From 388673257efa844b635075514f44fc768a7275a5 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 24 Jul 2023 23:50:07 -0700 Subject: [PATCH 19/33] more cleanup --- csrc/executor.cpp | 5 ++--- csrc/executor_kernel_arg.cpp | 4 ---- csrc/executor_kernel_arg.h | 15 --------------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index 32faddb1a72..e1493b8933c 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1621,7 +1621,7 @@ std::vector FusionExecutor::runFusion( for (const auto i : c10::irange(inputs.size())) { executor_utils::bindInputForExprEvaluation( - inputs[i], args[i], true, expr_eval); + inputs.at(i), args[i], true, expr_eval); } // only allocate outputs when not given @@ -1680,7 +1680,7 @@ std::vector FusionExecutor::runFusion( args.push(intermediate_buffer); intermediates.push_back(intermediate_buffer); executor_utils::bindInputForExprEvaluation( - kernel()->summary().global_allocations[i]->buffer(), + kernel()->summary().global_allocations.at(i)->buffer(), args[inputs.size() + outputs.size() + i], true, expr_eval, @@ -1704,7 +1704,6 @@ std::vector FusionExecutor::runFusion( arg_buffers.emplace_back( (std::byte*)&philox_seed, (std::byte*)&philox_seed + sizeof(philox_seed)); - args.push(philox_seed); } if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index 3c5b37dc2c3..6bcad72f11a 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -532,10 +532,6 @@ void KernelArgumentHolder::push(int64_t val) { arguments_.push_back(std::make_unique(val)); } -void KernelArgumentHolder::push(const at::PhiloxCudaState& val) { - arguments_.push_back(std::make_unique(val)); -} - // Create buffer, flatten arguments into it, align by 8 Bytes, return pointers // in the buffer void** KernelArgumentHolder::getBuffer( diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index c619a960e38..2e138a3c4e7 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -22,7 +22,6 @@ namespace nvfuser { // TODO: macro this and the printer below enum class ArgType { - PhiloxCudaState, Long, Double, ComplexDouble, @@ -34,9 +33,6 @@ enum class ArgType { inline std::string argTypeToString(ArgType type) { std::string ret; switch (type) { - case ArgType::PhiloxCudaState: - ret = "PhiloxCudaState"; - break; case ArgType::Long: ret = "Long"; break; @@ -154,13 +150,6 @@ struct ArgAbstract { return ss.str(); \ } -// TODO: remove this -struct PhiloxCudaStateArg : public ArgAbstract { - at::PhiloxCudaState val_; - PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){}; - DEF_HELPEE_FUNC(PhiloxCudaState, val_) -}; - // TODO: remove this struct LongArg : public ArgAbstract { int64_t val_; @@ -434,8 +423,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder { // Push a scalar or integer to the arguments void push(const c10::IValue& val); - void push(const at::PhiloxCudaState& val); - // Create a buffer, flatten arguments into it, align by 8 Bytes, return // pointers in the buffer. Tensor arguments are passed with the given index // type. @@ -461,8 +448,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder { return arguments_.back().get(); } - void appendPhiloxRNGSeed(uint64_t rand_offset); - const ArgAbstract* at(size_t ind) const { return arguments_.at(ind).get(); }; From a6e35248509a1017a3f8fcd16a37676a8d4de492 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 25 Jul 2023 11:28:49 -0700 Subject: [PATCH 20/33] save --- CMakeLists.txt | 1 + csrc/ir/nodes.cpp | 50 ------------------------------ csrc/kernel.cpp | 20 ++++++++++-- csrc/tensor_metadata.cpp | 67 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 53 deletions(-) create mode 100644 csrc/tensor_metadata.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3d7950c3e..7b1e64e13cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/type_inference.cpp ${NVFUSER_SRCS_DIR}/type_promotion.cpp ${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp + ${NVFUSER_SRCS_DIR}/tensor_metadata.cpp ${NVFUSER_SRCS_DIR}/tensor_view.cpp ${NVFUSER_SRCS_DIR}/transform_iter.cpp ${NVFUSER_SRCS_DIR}/transform_replay.cpp diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index 8008fc9533d..29042b8d20b 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -739,56 +739,6 @@ std::vector GetAttr::evaluate( NVFUSER_DEFINE_CLONE_AND_CREATE(GetAttr) -GetMetaData::GetMetaData(IrBuilderPasskey passkey, Val* output, Val* input) - : Expr(passkey) { - addOutput(output); - addInput(input); - TORCH_INTERNAL_ASSERT( - out()->dtype() == metaDataTypeOf(in()), - "Data type mismatch for GetMetaData") -} - -std::string GetMetaData::toString(int indent_size) const { - std::stringstream ss; - indent(ss, indent_size) << out()->toString() << " = getMetaData(" - << in()->toString() << ")\n"; - return ss.str(); -} - -std::string GetMetaData::toInlineString(int indent_size) const { - std::stringstream ss; - ss << "getMetaData(" << in()->toInlineString() << ")"; - return ss.str(); -} - -std::vector GetMetaData::evaluate( - const std::vector& inputs) const { - TORCH_INTERNAL_ASSERT(inputs.size() == 1, "GetMetaData expects 1 input"); - TORCH_INTERNAL_ASSERT( - in()->isA(), - "Currently, GetMetaData only supports TensorView"); - TensorView* tv = in()->as(); - if (tv->getMemoryType() == MemoryType::Shared) { - // Smem tensor is defined locally as a pointer. It is impossible to know the - // actual address, but using nullptr is a good approximation. - return {PolymorphicValue(Pointer(nullptr, tv->dtype()))}; - } - - at::Tensor input = inputs.at(0).as(); - - Struct concrete_value; - concrete_value["data"] = - PolymorphicValue(Pointer(input.data_ptr(), tv->dtype())); - concrete_value["size"] = PolymorphicValue(input.sizes().vec()); - // TODO: this is not correct, strides actually needs to be based on allocation - // domain, but input.strides() is on the rFactor domain. We need to refactor - // our executor to move related logic here. - concrete_value["stride"] = PolymorphicValue(input.strides().vec()); - return {PolymorphicValue(concrete_value)}; -} - -NVFUSER_DEFINE_CLONE_AND_CREATE(GetMetaData) - TensorConstruct::TensorConstruct( IrBuilderPasskey passkey, TensorView* output, diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index 3ff9e3ff86e..e0fc85678a1 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -315,10 +315,24 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); - parameters_ = GpuLower::current()->allKnownVals(); - parameters_.insert(parameters_.end(), outputs().begin(), outputs().end()); + auto maybe_metadata = [](Val* v) { + if (auto tv = dynamic_cast(v)) { + return IrBuilder::metadataExpr(tv); + } else { + return v; + } + }; + parameters_.reserve(GpuLower::current()->allKnownVals().size()); + for (auto v : GpuLower::current()->allKnownVals()) { + parameters_.push_back(maybe_metadata(v)); + } + parameters_.reserve(parameters_.size() + outputs().size()); + for (auto v : outputs()) { + parameters_.push_back(maybe_metadata(v)); + } + parameters_.reserve(parameters_.size() + summary_.global_allocations.size()); for (auto alloc : summary_.global_allocations) { - parameters_.push_back(alloc->buffer()); + parameters_.push_back(maybe_metadata(alloc->buffer())); } } diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp new file mode 100644 index 00000000000..131293957c1 --- /dev/null +++ b/csrc/tensor_metadata.cpp @@ -0,0 +1,67 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#include +#include +#include +#include +#include + +namespace nvfuser { + +GetMetaData::GetMetaData(IrBuilderPasskey passkey, Val* output, Val* input) + : Expr(passkey) { + addOutput(output); + addInput(input); + TORCH_INTERNAL_ASSERT( + out()->dtype() == metaDataTypeOf(in()), + "Data type mismatch for GetMetaData") +} + +std::string GetMetaData::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << out()->toString() << " = getMetaData(" + << in()->toString() << ")\n"; + return ss.str(); +} + +std::string GetMetaData::toInlineString(int indent_size) const { + std::stringstream ss; + ss << "getMetaData(" << in()->toInlineString() << ")"; + return ss.str(); +} + +std::vector GetMetaData::evaluate( + const std::vector& inputs) const { + TORCH_INTERNAL_ASSERT(inputs.size() == 1, "GetMetaData expects 1 input"); + TORCH_INTERNAL_ASSERT( + in()->isA(), + "Currently, GetMetaData only supports TensorView"); + TensorView* tv = in()->as(); + if (tv->getMemoryType() == MemoryType::Shared) { + // Smem tensor is defined locally as a pointer. It is impossible to know the + // actual address, but using nullptr is a good approximation. + return {PolymorphicValue(Pointer(nullptr, tv->dtype()))}; + } + + at::Tensor input = inputs.at(0).as(); + + Struct concrete_value; + concrete_value["data"] = + PolymorphicValue(Pointer(input.data_ptr(), tv->dtype())); + concrete_value["size"] = PolymorphicValue(input.sizes().vec()); + // TODO: this is not correct, strides actually needs to be based on allocation + // domain, but input.strides() is on the rFactor domain. We need to refactor + // our executor to move related logic here. + concrete_value["stride"] = PolymorphicValue(input.strides().vec()); + return {PolymorphicValue(concrete_value)}; +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(GetMetaData) + +} // namespace nvfuser From c6ec315efa7891607a63b799c3fc53d5635e5eb3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 25 Jul 2023 17:23:25 -0700 Subject: [PATCH 21/33] revert kernel change --- csrc/kernel.cpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/csrc/kernel.cpp b/csrc/kernel.cpp index e0fc85678a1..3ff9e3ff86e 100644 --- a/csrc/kernel.cpp +++ b/csrc/kernel.cpp @@ -315,24 +315,10 @@ void Kernel::finalize(std::vector top_level_exprs) { summary_.sync_map = GpuLower::current()->syncMap(); summary_.parallel_dimension_map_ = GpuLower::current()->parallelDimensionMap(); - auto maybe_metadata = [](Val* v) { - if (auto tv = dynamic_cast(v)) { - return IrBuilder::metadataExpr(tv); - } else { - return v; - } - }; - parameters_.reserve(GpuLower::current()->allKnownVals().size()); - for (auto v : GpuLower::current()->allKnownVals()) { - parameters_.push_back(maybe_metadata(v)); - } - parameters_.reserve(parameters_.size() + outputs().size()); - for (auto v : outputs()) { - parameters_.push_back(maybe_metadata(v)); - } - parameters_.reserve(parameters_.size() + summary_.global_allocations.size()); + parameters_ = GpuLower::current()->allKnownVals(); + parameters_.insert(parameters_.end(), outputs().begin(), outputs().end()); for (auto alloc : summary_.global_allocations) { - parameters_.push_back(maybe_metadata(alloc->buffer())); + parameters_.push_back(alloc->buffer()); } } From a694f1a9cd3338ac7c87688be8071363e333ceb2 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 25 Jul 2023 17:47:27 -0700 Subject: [PATCH 22/33] comment --- csrc/device_lower/lower2device.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/csrc/device_lower/lower2device.cpp b/csrc/device_lower/lower2device.cpp index a890f4d987a..1219f8e1d46 100644 --- a/csrc/device_lower/lower2device.cpp +++ b/csrc/device_lower/lower2device.cpp @@ -279,6 +279,9 @@ void GpuLower::lower(Fusion* fusion) { // Temporarily set allKnownVals to inputs. In the future, we will have a real // pass to determine how to set allKnownVals. + // TODO: revisit all passes on how they handle exprs in the fusion. Should we + // change their use of fusion_->exprs() to only include exprs that are not + // between inputs and allKnownVals()? allKnownVals() = kernel_->inputs(); dumpExprsIfEnabled(fusion_->exprs(), "set allKnownVals"); From 30292f7cdf28b8d23bd26c54fa5411879232b4c2 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 26 Jul 2023 23:23:47 -0700 Subject: [PATCH 23/33] fix --- csrc/tensor_metadata.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 131293957c1..45da3af30b7 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -37,6 +37,7 @@ std::string GetMetaData::toInlineString(int indent_size) const { } std::vector GetMetaData::evaluate( + const ExpressionEvaluator& ee, const std::vector& inputs) const { TORCH_INTERNAL_ASSERT(inputs.size() == 1, "GetMetaData expects 1 input"); TORCH_INTERNAL_ASSERT( From 27b5ad8b9ee23e49cb3f2762b96f2c3c68a56a4a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 26 Jul 2023 23:41:46 -0700 Subject: [PATCH 24/33] save both logical and alloc size and stride --- csrc/executor.cpp | 4 ++-- csrc/executor_kernel_arg.cpp | 4 ++-- csrc/index_compute.cpp | 4 ++-- csrc/ir/utils.cpp | 5 +++-- csrc/tensor_metadata.cpp | 9 ++++----- csrc/type.cpp | 8 ++++++-- test/test_evaluator.cpp | 4 ++-- test/test_tensor_factories.cpp | 8 ++++---- 8 files changed, 25 insertions(+), 21 deletions(-) diff --git a/csrc/executor.cpp b/csrc/executor.cpp index 17fd4fb385d..d4ea0c69311 100644 --- a/csrc/executor.cpp +++ b/csrc/executor.cpp @@ -1879,8 +1879,8 @@ float FusionExecutor::runRtc( Struct concrete_value; concrete_value["data"] = PolymorphicValue( Pointer(input.data_ptr(), aten_to_data_type(input.scalar_type()))); - concrete_value["size"] = PolymorphicValue(input.sizes().vec()); - concrete_value["stride"] = PolymorphicValue(input.strides().vec()); + concrete_value["logical_size"] = PolymorphicValue(input.sizes().vec()); + concrete_value["alloc_stride"] = PolymorphicValue(input.strides().vec()); data.emplace_back(getTensorArgBuffer(concrete_value, index_type)); pointers.emplace_back(data.back().data()); } diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index 8ae78c06f09..a6fece08164 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -650,8 +650,8 @@ std::vector getTensorArgBuffer( auto struct_ = metadata.as(); std::vector buffer; void* ptr = (void*)struct_["data"]; - std::vector sizes = (std::vector)struct_["size"]; - std::vector strides = (std::vector)struct_["stride"]; + std::vector sizes = (std::vector)struct_["logical_size"]; + std::vector strides = (std::vector)struct_["alloc_stride"]; if (index_type == PrimDataType::Int) { buffer.reserve( sizeof(ptr) + sizeof(int64_t) * (sizes.size() + strides.size())); diff --git a/csrc/index_compute.cpp b/csrc/index_compute.cpp index 01b1c4159e2..ff0c86153f5 100644 --- a/csrc/index_compute.cpp +++ b/csrc/index_compute.cpp @@ -1402,7 +1402,7 @@ std::vector Index::getGlobalProducerStridedIndices( } strides[i] = IrBuilder::getItemExpr( IrBuilder::getAttrExpr( - IrBuilder::metadataExpr(producer_tv), "stride"), + IrBuilder::metadataExpr(producer_tv), "alloc_stride"), stride_i++); } } @@ -1758,7 +1758,7 @@ std::vector Index::getStrides(TensorView* tv) { continue; } strides[i] = IrBuilder::getItemExpr( - IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "stride"), + IrBuilder::getAttrExpr(IrBuilder::metadataExpr(tv), "alloc_stride"), stride_i++); } } diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 46590eb3864..9ee0257f3da 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1149,11 +1149,12 @@ bool isTensorSize(const Val* val) { return true; } } - return isTensorAttr(val, "size"); + return isTensorAttr(val, "logical_size") || isTensorAttr(val, "alloc_size"); } bool isTensorStride(const Val* val) { - return isTensorAttr(val, "stride"); + return isTensorAttr(val, "logical_stride") || + isTensorAttr(val, "alloc_stride"); } } // namespace nvfuser::ir_utils diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 45da3af30b7..f2455d8fe55 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -55,11 +55,10 @@ std::vector GetMetaData::evaluate( Struct concrete_value; concrete_value["data"] = PolymorphicValue(Pointer(input.data_ptr(), tv->dtype())); - concrete_value["size"] = PolymorphicValue(input.sizes().vec()); - // TODO: this is not correct, strides actually needs to be based on allocation - // domain, but input.strides() is on the rFactor domain. We need to refactor - // our executor to move related logic here. - concrete_value["stride"] = PolymorphicValue(input.strides().vec()); + concrete_value["logical_size"] = PolymorphicValue(input.sizes().vec()); + concrete_value["logical_stride"] = PolymorphicValue(input.strides().vec()); + concrete_value["alloc_size"] = PolymorphicValue(input.sizes().vec()); + concrete_value["alloc_stride"] = PolymorphicValue(input.strides().vec()); return {PolymorphicValue(concrete_value)}; } diff --git a/csrc/type.cpp b/csrc/type.cpp index 0b7c1f27fef..40377a27352 100644 --- a/csrc/type.cpp +++ b/csrc/type.cpp @@ -37,9 +37,13 @@ DataType metaDataTypeOf(const Val* v) { tv_metadata.name = ss.str(); tv_metadata.types["data"] = NVFUSER_MAYBE_MAKE_SHARED( PointerOf{std::make_shared(tv->dtype())}); - tv_metadata.types["size"] = NVFUSER_MAYBE_MAKE_SHARED2( + tv_metadata.types["logical_size"] = NVFUSER_MAYBE_MAKE_SHARED2( ArrayOf{std::make_shared(DataType::Index), dim}); - tv_metadata.types["stride"] = NVFUSER_MAYBE_MAKE_SHARED2( + tv_metadata.types["logical_stride"] = NVFUSER_MAYBE_MAKE_SHARED2( + ArrayOf{std::make_shared(DataType::Index), dim}); + tv_metadata.types["alloc_size"] = NVFUSER_MAYBE_MAKE_SHARED2( + ArrayOf{std::make_shared(DataType::Index), alloc_dim}); + tv_metadata.types["alloc_stride"] = NVFUSER_MAYBE_MAKE_SHARED2( ArrayOf{std::make_shared(DataType::Index), alloc_dim}); return tv_metadata; } diff --git a/test/test_evaluator.cpp b/test/test_evaluator.cpp index ebd89b3aeca..a725d25513e 100644 --- a/test/test_evaluator.cpp +++ b/test/test_evaluator.cpp @@ -318,8 +318,8 @@ TEST_F(ExprEvalTest, TensorMetaData) { TensorView* tv = makeSymbolicTensor(2); auto metadata = IrBuilder::metadataExpr(tv); auto data = IrBuilder::getAttrExpr(metadata, "data"); - auto sizes = IrBuilder::getAttrExpr(metadata, "size"); - auto strides = IrBuilder::getAttrExpr(metadata, "stride"); + auto sizes = IrBuilder::getAttrExpr(metadata, "logical_size"); + auto strides = IrBuilder::getAttrExpr(metadata, "alloc_stride"); auto size0 = IrBuilder::getItemExpr(sizes, fusion.zeroVal()); auto size1 = IrBuilder::getItemExpr(sizes, fusion.oneVal()); auto stride0 = IrBuilder::getItemExpr(strides, fusion.zeroVal()); diff --git a/test/test_tensor_factories.cpp b/test/test_tensor_factories.cpp index 9c38262c444..9e05fe20bf8 100644 --- a/test/test_tensor_factories.cpp +++ b/test/test_tensor_factories.cpp @@ -539,10 +539,10 @@ TEST_F(TensorFactoryTest, MetadataAsTensor) { auto meta0_copy2 = set(meta0_copy1); auto meta1_copy2 = set(meta1_copy1); - auto size0 = IrBuilder::getAttrExpr(meta0_copy2, "size"); - auto stride0 = IrBuilder::getAttrExpr(meta0_copy2, "stride"); - auto size1 = IrBuilder::getAttrExpr(meta1_copy2, "size"); - auto stride1 = IrBuilder::getAttrExpr(meta1_copy2, "stride"); + auto size0 = IrBuilder::getAttrExpr(meta0_copy2, "logical_size"); + auto stride0 = IrBuilder::getAttrExpr(meta0_copy2, "alloc_stride"); + auto size1 = IrBuilder::getAttrExpr(meta1_copy2, "logical_size"); + auto stride1 = IrBuilder::getAttrExpr(meta1_copy2, "alloc_stride"); auto output = tensor(std::vector{size0, stride0, size1, stride1}); fusion->addOutput(output); From 0ddb2974329a1e3806d1e013e13018eac8ecab8c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 26 Jul 2023 23:45:51 -0700 Subject: [PATCH 25/33] move code --- csrc/executor_kernel_arg.cpp | 297 --------------------------------- csrc/ir/nodes.cpp | 24 +++ csrc/tensor_metadata.cpp | 313 ++++++++++++++++++++++++++++++++--- 3 files changed, 317 insertions(+), 317 deletions(-) diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index a6fece08164..314de44218c 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -14,303 +14,6 @@ namespace nvfuser { -namespace { - -// Forward traverse from rFactor domain to allocation domain, compute frontier -// sizes and strides, validate that splits are divisible and merges are -// contiguous, and update active_ids_ correspondingly. -class ForwardTraverseFromRFactorToAlloc { - ExpressionEvaluator& ee_; - std::unordered_map>& active_ids_; - - void handle(Split* split) { - auto in = split->in(); - auto inner = split->inner(); - auto outer = split->outer(); - auto in_it = active_ids_.find(in); - // TORCH_INTERNAL_ASSERT(in_it != active_ids_.end()) - if (in_it == active_ids_.end()) { - // TODO: see [Allocation domain on both side of rFactor] - return; - } - auto [in_size, in_stride] = in_it->second; - auto factor = ee_.evaluate(split->factor()).as(); - TORCH_INTERNAL_ASSERT( - in_size % factor == 0, - "The rFactor domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "non-divisible split is not allowed in allocation domain"); - TORCH_INTERNAL_ASSERT(active_ids_.erase(in) == 1); - TORCH_INTERNAL_ASSERT( - active_ids_ - .emplace(inner, std::pair{factor, in_stride}) - .second); - TORCH_INTERNAL_ASSERT(active_ids_ - .emplace( - outer, - std::pair{ - in_size / factor, in_stride * factor}) - .second); - } - - void handle(Merge* merge) { - auto inner = merge->inner(); - auto outer = merge->outer(); - auto out = merge->out(); - auto inner_it = active_ids_.find(inner); - auto outer_it = active_ids_.find(outer); - // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) - // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) - if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { - // TODO: see [Allocation domain on both side of rFactor] - return; - } - auto [inner_size, inner_stride] = inner_it->second; - auto [outer_size, outer_stride] = outer_it->second; - TORCH_INTERNAL_ASSERT( - inner_stride * inner_size == outer_stride, - "The rFactor domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "merging of discontiguous dimensions is not allowed in allocation domain"); - TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); - TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); - TORCH_INTERNAL_ASSERT(active_ids_ - .emplace( - out, - std::pair{ - inner_size * outer_size, inner_stride}) - .second); - } - - void handle(Expr* expr) { - if (auto split = dynamic_cast(expr)) { - handle(split); - } else if (auto merge = dynamic_cast(expr)) { - handle(merge); - } else { - TORCH_INTERNAL_ASSERT( - false, "Unsupported transormation in allocation domain"); - } - } - - public: - ForwardTraverseFromRFactorToAlloc( - ExpressionEvaluator& ee, - std::unordered_map>& active_ids) - : ee_(ee), active_ids_(active_ids) {} - - void run( - TensorView* tv, - const std::vector& rfactor, - const std::vector& alloc) { - auto forward_exprs = StmtSort::getExprsBetween( - tv->fusion(), - {rfactor.begin(), rfactor.end()}, - {alloc.begin(), alloc.end()}); - for (auto expr : forward_exprs) { - handle(expr); - } - } -}; - -// Similar to ForwardTraverseFromRFactorToAlloc, but in the opposite direction. -class BackwardTraverseFromRFactorToAlloc { - at::Tensor tensor_; - ExpressionEvaluator& ee_; - std::unordered_map>& active_ids_; - - void handle(Split* split) { - auto in = split->in(); - auto inner = split->inner(); - auto outer = split->outer(); - auto inner_it = active_ids_.find(inner); - auto outer_it = active_ids_.find(outer); - // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) - // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) - if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { - // TODO: see [Allocation domain on both side of rFactor] - return; - } - auto [inner_size, inner_stride] = inner_it->second; - auto [outer_size, outer_stride] = outer_it->second; - TORCH_INTERNAL_ASSERT( - inner_stride * inner_size == outer_stride, - "The rFactor domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"); - TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); - TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); - TORCH_INTERNAL_ASSERT(active_ids_ - .emplace( - in, - std::pair{ - inner_size * outer_size, inner_stride}) - .second); - } - - void handle(Merge* merge) { - auto inner = merge->inner(); - auto outer = merge->outer(); - auto out = merge->out(); - auto factor = ee_.evaluate(inner->extent()).as(); - auto out_it = active_ids_.find(out); - // TORCH_INTERNAL_ASSERT(out_it != active_ids_.end()) - if (out_it == active_ids_.end()) { - // TODO: see [Allocation domain on both side of rFactor] - return; - } - auto [out_size, out_stride] = out_it->second; - TORCH_INTERNAL_ASSERT( - out_size % factor == 0, - "The rFactor domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "the size of the output must divisible by the size of inner dimension"); - TORCH_INTERNAL_ASSERT(active_ids_.erase(out) == 1); - TORCH_INTERNAL_ASSERT( - active_ids_ - .emplace(inner, std::pair{factor, out_stride}) - .second); - TORCH_INTERNAL_ASSERT(active_ids_ - .emplace( - outer, - std::pair{ - out_size / factor, out_stride * factor}) - .second); - } - - void handle(Expr* expr) { - if (auto split = dynamic_cast(expr)) { - handle(split); - } else if (auto merge = dynamic_cast(expr)) { - handle(merge); - } else { - TORCH_INTERNAL_ASSERT( - false, "Unsupported transormation in allocation domain"); - } - } - - public: - BackwardTraverseFromRFactorToAlloc( - ExpressionEvaluator& ee, - std::unordered_map>& active_ids) - : ee_(ee), active_ids_(active_ids) {} - - void run( - TensorView* tv, - const std::vector& rfactor, - const std::vector& alloc) { - auto backward_exprs = StmtSort::getExprsBetween( - tv->fusion(), - {alloc.begin(), alloc.end()}, - {rfactor.begin(), rfactor.end()}); - std::reverse(backward_exprs.begin(), backward_exprs.end()); - for (auto expr : backward_exprs) { - handle(expr); - } - } -}; - -} // namespace - -// Given an ATen tensor, whose sizes and strides are w.r.t to the rFactor domain -// of its corresponding TensorView, compute the sizes and strides of the tensor -// with respect to its allocation domain. -// For example, if the rFactor domain is [I1, I2], and the allocation domain is -// [I2*I1], and the tensor's size is [5, 3] and stride is [2, 10], then the -// resulting size will be [15] and stride will be [2] -// Another example, if the rFactor domain is [I1*I2] and the allocation domain -// is [I1, I2], and the tensor's size is [15] and stride is [7], and the extent -// of I2 is 5, then the resulting size will be [3, 5] and stride will be [35, 7] -std::vector> -inferAndValidateAllocationSizesAndStrides( - const at::Tensor& tensor, - TensorView* tv, - ExpressionEvaluator& ee) { - if (tv == nullptr || !tv->hasAllocation()) { - // When tv is nullptr, or tv does not have allocation, the given sizes and - // strides should already be in the target format. So nothing to do here. - std::vector> result; - for (auto i : c10::irange(tensor.dim())) { - result.emplace_back(tensor.size(i), tensor.stride(i)); - } - return result; - } - const auto& alloc = - TensorDomain::noReductions(tv->getMaybeAllocationDomain()); - const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - - // active IDs and their shape and stride - std::unordered_map> active_ids; - TORCH_INTERNAL_ASSERT((int64_t)rfactor.size() == tensor.dim()); - for (int64_t i : c10::irange((int64_t)rfactor.size())) { - auto rf_id = rfactor.at(i); - active_ids[rf_id] = {tensor.size(i), tensor.stride(i)}; - } - - ForwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); - BackwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); - - // Now active_ids should contain the final sizes and strides, unordered. We - // need to put them to the correct order. - std::vector> sizes_strides; - sizes_strides.reserve(alloc.size()); - for (auto i : c10::irange(alloc.size())) { - auto id = alloc.at(i); - sizes_strides.emplace_back(active_ids.at(id)); - } - // Validate final sizes and strides with contiguity - int64_t contiguous_stride = 1; - std::vector> contiguity = tv->getContiguity(); - for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; i--) { - if (alloc.at(i)->isBroadcast()) { - continue; - } - while (!contiguity.back().has_value()) { - contiguity.pop_back(); - } - auto [size, stride] = sizes_strides.at(i); - TORCH_INTERNAL_ASSERT(!contiguity.empty()); - auto last_contiguity = contiguity.back(); - TORCH_INTERNAL_ASSERT( - last_contiguity.has_value(), - "I don't think this check makes sense, but unfortunately ", - "clang-tidy is not smart enough to infer from the context that this is always true."); - if (*last_contiguity) { - TORCH_CHECK( - stride == contiguous_stride, - "Stride mismatch with contiguity info. ", - "tv: ", - tv->toString(), - " allocation domain: ", - ir_utils::toString(tv->getMaybeAllocationDomain()), - " dim: ", - i, - " expected stride: ", - contiguous_stride, - " actual stride: ", - stride); - } - contiguous_stride = stride * size; - contiguity.pop_back(); - } - TORCH_INTERNAL_ASSERT( - contiguity.empty(), - "The size of contiguity mismatch with the dimensionality of allocation domain"); - // Validate that for expanded broadcast, the stride must be zero. - for (int64_t i : c10::irange((int64_t)sizes_strides.size())) { - if (auto alloc_id = alloc.at(i); alloc_id->hasExpandedExtent()) { - auto [_, stride] = sizes_strides.at(i); - TORCH_CHECK( - stride == 0, - "Expecting an expanded dimension on dimension ", - i, - " but found stride ", - stride); - } - } - return sizes_strides; -} - PrimDataType TensorArgAbstract::getSmallestIndexType() const { KernelIndexTypeCompute index_type_helper; for (const auto dim_i : c10::irange(tensor_.ndimension())) { diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index bf6a26f989e..ff62eb11af1 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -745,6 +745,30 @@ std::vector GetAttr::evaluate( NVFUSER_DEFINE_CLONE_AND_CREATE(GetAttr) +GetMetaData::GetMetaData(IrBuilderPasskey passkey, Val* output, Val* input) + : Expr(passkey) { + addOutput(output); + addInput(input); + TORCH_INTERNAL_ASSERT( + out()->dtype() == metaDataTypeOf(in()), + "Data type mismatch for GetMetaData") +} + +std::string GetMetaData::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << out()->toString() << " = getMetaData(" + << in()->toString() << ")\n"; + return ss.str(); +} + +std::string GetMetaData::toInlineString(int indent_size) const { + std::stringstream ss; + ss << "getMetaData(" << in()->toInlineString() << ")"; + return ss.str(); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(GetMetaData) + TensorConstruct::TensorConstruct( IrBuilderPasskey passkey, TensorView* output, diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index f2455d8fe55..bbb4889d144 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -14,26 +14,301 @@ namespace nvfuser { -GetMetaData::GetMetaData(IrBuilderPasskey passkey, Val* output, Val* input) - : Expr(passkey) { - addOutput(output); - addInput(input); - TORCH_INTERNAL_ASSERT( - out()->dtype() == metaDataTypeOf(in()), - "Data type mismatch for GetMetaData") -} +namespace { -std::string GetMetaData::toString(int indent_size) const { - std::stringstream ss; - indent(ss, indent_size) << out()->toString() << " = getMetaData(" - << in()->toString() << ")\n"; - return ss.str(); -} +// Forward traverse from rFactor domain to allocation domain, compute frontier +// sizes and strides, validate that splits are divisible and merges are +// contiguous, and update active_ids_ correspondingly. +class ForwardTraverseFromRFactorToAlloc { + ExpressionEvaluator& ee_; + std::unordered_map>& active_ids_; + + void handle(Split* split) { + auto in = split->in(); + auto inner = split->inner(); + auto outer = split->outer(); + auto in_it = active_ids_.find(in); + // TORCH_INTERNAL_ASSERT(in_it != active_ids_.end()) + if (in_it == active_ids_.end()) { + // TODO: see [Allocation domain on both side of rFactor] + return; + } + auto [in_size, in_stride] = in_it->second; + auto factor = ee_.evaluate(split->factor()).as(); + TORCH_INTERNAL_ASSERT( + in_size % factor == 0, + "The rFactor domain and allocation domain of fusion input/output ", + "tensors must be a one-to-one map, therefore, ", + "non-divisible split is not allowed in allocation domain"); + TORCH_INTERNAL_ASSERT(active_ids_.erase(in) == 1); + TORCH_INTERNAL_ASSERT( + active_ids_ + .emplace(inner, std::pair{factor, in_stride}) + .second); + TORCH_INTERNAL_ASSERT(active_ids_ + .emplace( + outer, + std::pair{ + in_size / factor, in_stride * factor}) + .second); + } + + void handle(Merge* merge) { + auto inner = merge->inner(); + auto outer = merge->outer(); + auto out = merge->out(); + auto inner_it = active_ids_.find(inner); + auto outer_it = active_ids_.find(outer); + // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) + // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) + if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { + // TODO: see [Allocation domain on both side of rFactor] + return; + } + auto [inner_size, inner_stride] = inner_it->second; + auto [outer_size, outer_stride] = outer_it->second; + TORCH_INTERNAL_ASSERT( + inner_stride * inner_size == outer_stride, + "The rFactor domain and allocation domain of fusion input/output ", + "tensors must be a one-to-one map, therefore, ", + "merging of discontiguous dimensions is not allowed in allocation domain"); + TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); + TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); + TORCH_INTERNAL_ASSERT(active_ids_ + .emplace( + out, + std::pair{ + inner_size * outer_size, inner_stride}) + .second); + } + + void handle(Expr* expr) { + if (auto split = dynamic_cast(expr)) { + handle(split); + } else if (auto merge = dynamic_cast(expr)) { + handle(merge); + } else { + TORCH_INTERNAL_ASSERT( + false, "Unsupported transormation in allocation domain"); + } + } + + public: + ForwardTraverseFromRFactorToAlloc( + ExpressionEvaluator& ee, + std::unordered_map>& active_ids) + : ee_(ee), active_ids_(active_ids) {} + + void run( + TensorView* tv, + const std::vector& rfactor, + const std::vector& alloc) { + auto forward_exprs = StmtSort::getExprsBetween( + tv->fusion(), + {rfactor.begin(), rfactor.end()}, + {alloc.begin(), alloc.end()}); + for (auto expr : forward_exprs) { + handle(expr); + } + } +}; + +// Similar to ForwardTraverseFromRFactorToAlloc, but in the opposite direction. +class BackwardTraverseFromRFactorToAlloc { + at::Tensor tensor_; + ExpressionEvaluator& ee_; + std::unordered_map>& active_ids_; + + void handle(Split* split) { + auto in = split->in(); + auto inner = split->inner(); + auto outer = split->outer(); + auto inner_it = active_ids_.find(inner); + auto outer_it = active_ids_.find(outer); + // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) + // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) + if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { + // TODO: see [Allocation domain on both side of rFactor] + return; + } + auto [inner_size, inner_stride] = inner_it->second; + auto [outer_size, outer_stride] = outer_it->second; + TORCH_INTERNAL_ASSERT( + inner_stride * inner_size == outer_stride, + "The rFactor domain and allocation domain of fusion input/output ", + "tensors must be a one-to-one map, therefore, ", + "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"); + TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); + TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); + TORCH_INTERNAL_ASSERT(active_ids_ + .emplace( + in, + std::pair{ + inner_size * outer_size, inner_stride}) + .second); + } + + void handle(Merge* merge) { + auto inner = merge->inner(); + auto outer = merge->outer(); + auto out = merge->out(); + auto factor = ee_.evaluate(inner->extent()).as(); + auto out_it = active_ids_.find(out); + // TORCH_INTERNAL_ASSERT(out_it != active_ids_.end()) + if (out_it == active_ids_.end()) { + // TODO: see [Allocation domain on both side of rFactor] + return; + } + auto [out_size, out_stride] = out_it->second; + TORCH_INTERNAL_ASSERT( + out_size % factor == 0, + "The rFactor domain and allocation domain of fusion input/output ", + "tensors must be a one-to-one map, therefore, ", + "the size of the output must divisible by the size of inner dimension"); + TORCH_INTERNAL_ASSERT(active_ids_.erase(out) == 1); + TORCH_INTERNAL_ASSERT( + active_ids_ + .emplace(inner, std::pair{factor, out_stride}) + .second); + TORCH_INTERNAL_ASSERT(active_ids_ + .emplace( + outer, + std::pair{ + out_size / factor, out_stride * factor}) + .second); + } -std::string GetMetaData::toInlineString(int indent_size) const { - std::stringstream ss; - ss << "getMetaData(" << in()->toInlineString() << ")"; - return ss.str(); + void handle(Expr* expr) { + if (auto split = dynamic_cast(expr)) { + handle(split); + } else if (auto merge = dynamic_cast(expr)) { + handle(merge); + } else { + TORCH_INTERNAL_ASSERT( + false, "Unsupported transormation in allocation domain"); + } + } + + public: + BackwardTraverseFromRFactorToAlloc( + ExpressionEvaluator& ee, + std::unordered_map>& active_ids) + : ee_(ee), active_ids_(active_ids) {} + + void run( + TensorView* tv, + const std::vector& rfactor, + const std::vector& alloc) { + auto backward_exprs = StmtSort::getExprsBetween( + tv->fusion(), + {alloc.begin(), alloc.end()}, + {rfactor.begin(), rfactor.end()}); + std::reverse(backward_exprs.begin(), backward_exprs.end()); + for (auto expr : backward_exprs) { + handle(expr); + } + } +}; + +} // namespace + +// Given an ATen tensor, whose sizes and strides are w.r.t to the rFactor domain +// of its corresponding TensorView, compute the sizes and strides of the tensor +// with respect to its allocation domain. +// For example, if the rFactor domain is [I1, I2], and the allocation domain is +// [I2*I1], and the tensor's size is [5, 3] and stride is [2, 10], then the +// resulting size will be [15] and stride will be [2] +// Another example, if the rFactor domain is [I1*I2] and the allocation domain +// is [I1, I2], and the tensor's size is [15] and stride is [7], and the extent +// of I2 is 5, then the resulting size will be [3, 5] and stride will be [35, 7] +std::vector> +inferAndValidateAllocationSizesAndStrides( + const at::Tensor& tensor, + TensorView* tv, + ExpressionEvaluator& ee) { + if (tv == nullptr || !tv->hasAllocation()) { + // When tv is nullptr, or tv does not have allocation, the given sizes and + // strides should already be in the target format. So nothing to do here. + std::vector> result; + for (auto i : c10::irange(tensor.dim())) { + result.emplace_back(tensor.size(i), tensor.stride(i)); + } + return result; + } + const auto& alloc = + TensorDomain::noReductions(tv->getMaybeAllocationDomain()); + const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); + + // active IDs and their shape and stride + std::unordered_map> active_ids; + TORCH_INTERNAL_ASSERT((int64_t)rfactor.size() == tensor.dim()); + for (int64_t i : c10::irange((int64_t)rfactor.size())) { + auto rf_id = rfactor.at(i); + active_ids[rf_id] = {tensor.size(i), tensor.stride(i)}; + } + + ForwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); + BackwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); + + // Now active_ids should contain the final sizes and strides, unordered. We + // need to put them to the correct order. + std::vector> sizes_strides; + sizes_strides.reserve(alloc.size()); + for (auto i : c10::irange(alloc.size())) { + auto id = alloc.at(i); + sizes_strides.emplace_back(active_ids.at(id)); + } + // Validate final sizes and strides with contiguity + int64_t contiguous_stride = 1; + std::vector> contiguity = tv->getContiguity(); + for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; i--) { + if (alloc.at(i)->isBroadcast()) { + continue; + } + while (!contiguity.back().has_value()) { + contiguity.pop_back(); + } + auto [size, stride] = sizes_strides.at(i); + TORCH_INTERNAL_ASSERT(!contiguity.empty()); + auto last_contiguity = contiguity.back(); + TORCH_INTERNAL_ASSERT( + last_contiguity.has_value(), + "I don't think this check makes sense, but unfortunately ", + "clang-tidy is not smart enough to infer from the context that this is always true."); + if (*last_contiguity) { + TORCH_CHECK( + stride == contiguous_stride, + "Stride mismatch with contiguity info. ", + "tv: ", + tv->toString(), + " allocation domain: ", + ir_utils::toString(tv->getMaybeAllocationDomain()), + " dim: ", + i, + " expected stride: ", + contiguous_stride, + " actual stride: ", + stride); + } + contiguous_stride = stride * size; + contiguity.pop_back(); + } + TORCH_INTERNAL_ASSERT( + contiguity.empty(), + "The size of contiguity mismatch with the dimensionality of allocation domain"); + // Validate that for expanded broadcast, the stride must be zero. + for (int64_t i : c10::irange((int64_t)sizes_strides.size())) { + if (auto alloc_id = alloc.at(i); alloc_id->hasExpandedExtent()) { + auto [_, stride] = sizes_strides.at(i); + TORCH_CHECK( + stride == 0, + "Expecting an expanded dimension on dimension ", + i, + " but found stride ", + stride); + } + } + return sizes_strides; } std::vector GetMetaData::evaluate( @@ -62,6 +337,4 @@ std::vector GetMetaData::evaluate( return {PolymorphicValue(concrete_value)}; } -NVFUSER_DEFINE_CLONE_AND_CREATE(GetMetaData) - } // namespace nvfuser From 2d223055ac3b99683cd0974ab840eedd52011916 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 21:45:40 -0700 Subject: [PATCH 26/33] without precomputed values --- csrc/device_lower/pass/replace_size.cpp | 2 +- csrc/executor_kernel_arg.cpp | 6 ++-- csrc/executor_kernel_arg.h | 23 -------------- csrc/executor_utils.cpp | 24 ++++++++------ csrc/expr_evaluator.cpp | 2 +- csrc/polymorphic_value.h | 4 +++ csrc/scheduler/registry.cpp | 38 +++++++++++----------- csrc/tensor_metadata.cpp | 42 +++++++++++++++---------- runtime/tensor.cu | 4 +-- 9 files changed, 69 insertions(+), 76 deletions(-) diff --git a/csrc/device_lower/pass/replace_size.cpp b/csrc/device_lower/pass/replace_size.cpp index 522a3cb541e..36581612509 100644 --- a/csrc/device_lower/pass/replace_size.cpp +++ b/csrc/device_lower/pass/replace_size.cpp @@ -203,7 +203,7 @@ void replaceSymbolicSizes(Fusion* fusion) { if (tensor_dim_map.find(orig_size) == tensor_dim_map.end() && !orig_size->isFusionInput() && !orig_size->isConstScalar()) { std::stringstream ss; - ss << "T" << tv->name() << ".size[" << dim++ << "]"; + ss << "T" << tv->name() << ".logical_size[" << dim++ << "]"; tensor_dim_map[orig_size] = IrBuilder::create( ss.str(), orig_size->getDataType().value()); } else { diff --git a/csrc/executor_kernel_arg.cpp b/csrc/executor_kernel_arg.cpp index 314de44218c..7c62fd00e6c 100644 --- a/csrc/executor_kernel_arg.cpp +++ b/csrc/executor_kernel_arg.cpp @@ -402,10 +402,8 @@ std::vector getKernelArgument( (std::byte*)tensor.data_ptr(), (std::byte*)tensor.data_ptr() + tensor.element_size()); } else { - auto resolved_arg = getTensorArg(tensor, tv, ee, index_type); - return std::vector( - (std::byte*)resolved_arg->arg(), - (std::byte*)resolved_arg->arg() + resolved_arg->argSize()); + auto metadata = ee.evaluate(IrBuilder::metadataExpr(tv)); + return getTensorArgBuffer(metadata, index_type); } } else if (isIntegralType(parameter->dtype())) { int64_t v = pv.as(); diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index 2c1efd9304c..7e688c46848 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -279,13 +279,6 @@ struct TensorArgAbstract : ArgAbstract { } }; -// TODO: move this to GetMetaData::evaluate -std::vector> -inferAndValidateAllocationSizesAndStrides( - const at::Tensor& tensor, - TensorView* tv, - ExpressionEvaluator& ee); - // TODO: remove this template struct TensorArg : public TensorArgAbstract { @@ -298,22 +291,6 @@ struct TensorArg : public TensorArgAbstract { for (const auto i : c10::irange(tensor.ndimension())) { instance_.setSize(i, (typename TENSOR_TYPE::index_type)tensor.size(i)); } - inferSetAndValidateStrides(tensor, tv, eval); - } - - void inferSetAndValidateStrides( - const at::Tensor& tensor, - TensorView* tv, - ExpressionEvaluator& eval) { - auto sizes_strides = - inferAndValidateAllocationSizesAndStrides(tensor, tv, eval); - TORCH_INTERNAL_ASSERT( - (size_t)instance_.nAllocationDims() == sizes_strides.size()); - for (auto i : c10::irange((int64_t)sizes_strides.size())) { - alloc_sizes.at(i) = sizes_strides.at(i).first; - using stride_t = typename TENSOR_TYPE::index_type; - instance_.setStride(i, (stride_t)sizes_strides.at(i).second); - } } int64_t getAllocRank() const override { diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 0734929404c..85fe91d788a 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -614,10 +614,10 @@ void validateAlignedVectorizeExtents( void validateAlignedVectorizedFusionInputOutput( const at::Tensor& aten_tensor, int word_size, - TensorView* tv) { - ExpressionEvaluator eval; - auto sizes_strides = - inferAndValidateAllocationSizesAndStrides(aten_tensor, tv, eval); + TensorView* tv, + ExpressionEvaluator eval) { + eval.bind(tv, aten_tensor); + auto metadata = eval.evaluate(IrBuilder::metadataExpr(tv)); std::vector no_reduction_to_full; for (int64_t i : @@ -627,7 +627,11 @@ void validateAlignedVectorizedFusionInputOutput( no_reduction_to_full.emplace_back(i); } } - TORCH_INTERNAL_ASSERT(sizes_strides.size() == no_reduction_to_full.size()); + + auto sizes = std::vector(metadata["logical_size"]); + auto strides = std::vector(metadata["alloc_stride"]); + TORCH_INTERNAL_ASSERT(sizes.size() == no_reduction_to_full.size()); + TORCH_INTERNAL_ASSERT(strides.size() == no_reduction_to_full.size()); TORCH_INTERNAL_ASSERT( reinterpret_cast(aten_tensor.data_ptr()) % @@ -647,8 +651,9 @@ void validateAlignedVectorizedFusionInputOutput( // domain must have stride 1. int64_t cur_contig_stride = 1; bool still_rightmost = true; - for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; --i) { - const auto [size, stride] = sizes_strides.at(i); + for (int64_t i = (int64_t)sizes.size() - 1; i >= 0; --i) { + const auto size = sizes.at(i); + const auto stride = strides.at(i); auto alloc_id = tv->getMaybeAllocationDomain().at(no_reduction_to_full.at(i)); const auto is_expanded_broadcasting = @@ -721,14 +726,15 @@ void validateAlignedVectorizedTensors( dynamic_cast(args[pos]); TORCH_INTERNAL_ASSERT(tensor_arg_abstract, "alias io only supports tensor"); validateAlignedVectorizedFusionInputOutput( - tensor_arg_abstract->getTensor(), word_size, tv); + tensor_arg_abstract->getTensor(), word_size, tv, expr_eval); } if (!outputs.empty()) { for (auto pos : tensor_vectorization_validation_entry.get() .aligned_vectorized_out_tensor_pos) { auto tv = kernel->outputs().at(pos)->as(); auto word_size = kernel->summary().vectorized_accesses.at(tv); - validateAlignedVectorizedFusionInputOutput(outputs[pos], word_size, tv); + validateAlignedVectorizedFusionInputOutput( + outputs[pos], word_size, tv, expr_eval); } } } diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 20c0c234791..1344cb6bdd3 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -144,7 +144,7 @@ void ExpressionEvaluator::print() const { debug() << "--------------------\n"; for (const auto& kv : known_values_) { TORCH_INTERNAL_ASSERT(!kv.first->isConstScalar()); - debug() << kv.first << " = " << kv.second << " ; " + debug() << kv.first << " = " << kv.second.type().name() << " ; " << *kv.first->getValType() << "\n"; } diff --git a/csrc/polymorphic_value.h b/csrc/polymorphic_value.h index 01eaf7fa2be..a8747ca20c3 100644 --- a/csrc/polymorphic_value.h +++ b/csrc/polymorphic_value.h @@ -168,6 +168,10 @@ class Pointer { explicit operator unsigned() const { return (unsigned)(int64_t)(*this); } + + explicit operator size_t() const { + return reinterpret_cast(ptr_); + } }; inline Pointer operator+(int64_t offset, const Pointer& ptr) { diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 92dc047d50d..429cfd1734d 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -959,6 +959,8 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( complete_fusion_->inputs().size() == args.size(), "Invalid number of arguments passed in for provided fusion group."); + // TODO: not supporting precomputed values for now + precomputed_values = nullptr; expression_evaluator_ = getExpressionEvaluator(args, precomputed_values); if (forced_index_type.has_value()) { @@ -971,34 +973,32 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( *expression_evaluator_); } - // Convert all abstract tensor args into tensor args and do tensor stride - // inference - std::vector tvs; - tvs.reserve(complete_fusion_->inputs().size()); - for (auto val : complete_fusion_->inputs()) { - tvs.emplace_back(dynamic_cast(val)); - } - args.getBuffer(index_type_, tvs, *expression_evaluator_); - for (auto inp_i : c10::irange(static_cast(args.size()))) { - auto kernel_arg = args[inp_i]; + auto fusion_inp = complete_fusion_->inputs().at(inp_i); + auto input_tv = dynamic_cast(fusion_inp); // Note: we are skipping CpuScalar tensor here - if (auto tensor_arg_abstract = - dynamic_cast(kernel_arg)) { - auto fusion_inp = complete_fusion_->inputs()[inp_i]; - input_ptrs_[fusion_inp] = tensor_arg_abstract->getPointerAddress(); + if (input_tv != nullptr && !input_tv->isCpuScalar()) { + auto metadata = + expression_evaluator_->evaluate(IrBuilder::metadataExpr(input_tv)); + std::vector alloc_sizes = + (std::vector)metadata["alloc_size"]; + std::vector alloc_strides = + (std::vector)metadata["alloc_stride"]; + TORCH_INTERNAL_ASSERT(alloc_sizes.size() == alloc_strides.size()); + + input_ptrs_[fusion_inp] = (size_t)metadata["data"]; // find and push discontiguous stride - auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType()); + int64_t dtype_size = dataTypeSize(input_tv->dtype()); input_discontig_strides_[fusion_inp] = {}; - auto dims = tensor_arg_abstract->getAllocRank(); + int64_t dims = alloc_strides.size(); int64_t expected_stride = 1; - for (auto dim = dims - 1; dim >= 0; dim--) { - auto size = tensor_arg_abstract->getAllocSize((int)dim); + for (int64_t dim = dims - 1; dim >= 0; dim--) { + auto size = alloc_sizes.at(dim); if (size <= 1) { continue; } - auto stride = tensor_arg_abstract->getAllocStride((int)dim); + auto stride = alloc_strides.at(dim); if (stride != expected_stride) { input_discontig_strides_[fusion_inp].push_back(stride * dtype_size); expected_stride = stride; diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index bbb4889d144..e0ff07dede4 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -6,10 +6,12 @@ */ // clang-format on +#include #include #include #include #include +#include #include namespace nvfuser { @@ -210,8 +212,6 @@ class BackwardTraverseFromRFactorToAlloc { } }; -} // namespace - // Given an ATen tensor, whose sizes and strides are w.r.t to the rFactor domain // of its corresponding TensorView, compute the sizes and strides of the tensor // with respect to its allocation domain. @@ -221,19 +221,21 @@ class BackwardTraverseFromRFactorToAlloc { // Another example, if the rFactor domain is [I1*I2] and the allocation domain // is [I1, I2], and the tensor's size is [15] and stride is [7], and the extent // of I2 is 5, then the resulting size will be [3, 5] and stride will be [35, 7] -std::vector> +std::pair, std::vector> inferAndValidateAllocationSizesAndStrides( const at::Tensor& tensor, TensorView* tv, - ExpressionEvaluator& ee) { + ExpressionEvaluator ee) { if (tv == nullptr || !tv->hasAllocation()) { // When tv is nullptr, or tv does not have allocation, the given sizes and // strides should already be in the target format. So nothing to do here. - std::vector> result; + std::vector sizes; + std::vector strides; for (auto i : c10::irange(tensor.dim())) { - result.emplace_back(tensor.size(i), tensor.stride(i)); + sizes.emplace_back(tensor.size(i)); + strides.emplace_back(tensor.stride(i)); } - return result; + return {sizes, strides}; } const auto& alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain()); @@ -252,23 +254,27 @@ inferAndValidateAllocationSizesAndStrides( // Now active_ids should contain the final sizes and strides, unordered. We // need to put them to the correct order. - std::vector> sizes_strides; - sizes_strides.reserve(alloc.size()); + std::vector sizes; + std::vector strides; + sizes.reserve(alloc.size()); + strides.reserve(alloc.size()); for (auto i : c10::irange(alloc.size())) { auto id = alloc.at(i); - sizes_strides.emplace_back(active_ids.at(id)); + sizes.emplace_back(active_ids.at(id).first); + strides.emplace_back(active_ids.at(id).second); } // Validate final sizes and strides with contiguity int64_t contiguous_stride = 1; std::vector> contiguity = tv->getContiguity(); - for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; i--) { + for (int64_t i = (int64_t)sizes.size() - 1; i >= 0; i--) { if (alloc.at(i)->isBroadcast()) { continue; } while (!contiguity.back().has_value()) { contiguity.pop_back(); } - auto [size, stride] = sizes_strides.at(i); + auto size = sizes.at(i); + auto stride = strides.at(i); TORCH_INTERNAL_ASSERT(!contiguity.empty()); auto last_contiguity = contiguity.back(); TORCH_INTERNAL_ASSERT( @@ -297,9 +303,9 @@ inferAndValidateAllocationSizesAndStrides( contiguity.empty(), "The size of contiguity mismatch with the dimensionality of allocation domain"); // Validate that for expanded broadcast, the stride must be zero. - for (int64_t i : c10::irange((int64_t)sizes_strides.size())) { + for (int64_t i : c10::irange((int64_t)strides.size())) { if (auto alloc_id = alloc.at(i); alloc_id->hasExpandedExtent()) { - auto [_, stride] = sizes_strides.at(i); + auto stride = strides.at(i); TORCH_CHECK( stride == 0, "Expecting an expanded dimension on dimension ", @@ -308,9 +314,11 @@ inferAndValidateAllocationSizesAndStrides( stride); } } - return sizes_strides; + return {sizes, strides}; } +} // namespace + std::vector GetMetaData::evaluate( const ExpressionEvaluator& ee, const std::vector& inputs) const { @@ -332,8 +340,8 @@ std::vector GetMetaData::evaluate( PolymorphicValue(Pointer(input.data_ptr(), tv->dtype())); concrete_value["logical_size"] = PolymorphicValue(input.sizes().vec()); concrete_value["logical_stride"] = PolymorphicValue(input.strides().vec()); - concrete_value["alloc_size"] = PolymorphicValue(input.sizes().vec()); - concrete_value["alloc_stride"] = PolymorphicValue(input.strides().vec()); + std::tie(concrete_value["alloc_size"], concrete_value["alloc_stride"]) = + inferAndValidateAllocationSizesAndStrides(input, tv, ee); return {PolymorphicValue(concrete_value)}; } diff --git a/runtime/tensor.cu b/runtime/tensor.cu index 8d5648bb28d..c0ee5436cf7 100644 --- a/runtime/tensor.cu +++ b/runtime/tensor.cu @@ -12,8 +12,8 @@ struct Tensor { }; T* data; - Array size; - Array stride; + Array logical_size; + Array alloc_stride; }; // Specialization for 0-dim case as it does not need size and stride arrays. From ca618beab178793572c371f43655cc9f5ed892c0 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 22:20:55 -0700 Subject: [PATCH 27/33] fix ExprSimplifierTest --- csrc/ir/nodes.cpp | 2 +- csrc/scheduler/registry.cpp | 7 ++--- test/test_expr_simplifier.cpp | 51 ++++++++++++++++++----------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index ff62eb11af1..4e414ec4b89 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -3626,7 +3626,7 @@ bool NamedScalar::sameAs(const Statement* other) const { } bool NamedScalar::isTensorSize() const { - static const std::regex r(R"(T\d+\.size\[\d+\])"); + static const std::regex r(R"(T\d+\.\w*size\[\d+\])"); return std::regex_match(name(), r); } diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 429cfd1734d..7706e78ea67 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -959,8 +959,6 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( complete_fusion_->inputs().size() == args.size(), "Invalid number of arguments passed in for provided fusion group."); - // TODO: not supporting precomputed values for now - precomputed_values = nullptr; expression_evaluator_ = getExpressionEvaluator(args, precomputed_values); if (forced_index_type.has_value()) { @@ -1030,11 +1028,10 @@ std::unique_ptr SchedulerRuntimeInfo:: const KernelArgumentHolder& args, PrecomputedValues* precomputed_values) { std::unique_ptr ee = - std::make_unique(); + std::make_unique( + executor_utils::bindInputs(args, complete_fusion_)); if (precomputed_values) { ee->bindPrecomputedValues(precomputed_values); - } else { - *ee = executor_utils::bindInputs(args, complete_fusion_); } return ee; } diff --git a/test/test_expr_simplifier.cpp b/test/test_expr_simplifier.cpp index 7887f400ed9..d0a4260a277 100644 --- a/test/test_expr_simplifier.cpp +++ b/test/test_expr_simplifier.cpp @@ -699,8 +699,8 @@ TEST_F(ExprSimplifierTest, SignProve) { assertProvedNonZero("1"_); assertProvedNonZero("2"_); - assertProvedNonNegative("T123.size[3]"_); - assertProvedNonNegative("T123.stride[3]"_); + assertProvedNonNegative("T123.logical_size[3]"_); + assertProvedNonNegative("T123.alloc_stride[3]"_); std::vector assumptions{ "i1 < 2 && i1 >= 0"_, @@ -780,18 +780,18 @@ TEST_F(ExprSimplifierTest, DistributeGcdRemainderDivMod) { expectSimplifiedMod("i1 * 3 + 2"_, "6"_, "( i1 % 2 ) * 3 + 2"_, {"i1 >= 0"_}); expectSimplifiedDiv( "i1 * 4 + 3"_, - "32 * T0.size[0]"_, - "i1 / ( 8 * T0.size[0] )"_, + "32 * T0.logical_size[0]"_, + "i1 / ( 8 * T0.logical_size[0] )"_, {"i1 >= 0"_}); expectSimplifiedMod( "i1 * 4 + 3"_, - "32 * T0.size[0]"_, - "( i1 % ( 8 * T0.size[0] ) ) * 4 + 3"_, + "32 * T0.logical_size[0]"_, + "( i1 % ( 8 * T0.logical_size[0] ) ) * 4 + 3"_, {"i1 >= 0"_}); expectSimplifiedDiv( - "( ( ( blockIdx.x * 128 + threadIdx.x ) % ( T0.size[3] * 24 ) ) * 4 ) + 3"_, - "32 * T0.size[3]"_, - "( ( blockIdx.x * 128 + threadIdx.x ) % ( T0.size[3] * 24 ) ) / ( 8 * T0.size[3] )"_, + "( ( ( blockIdx.x * 128 + threadIdx.x ) % ( T0.logical_size[3] * 24 ) ) * 4 ) + 3"_, + "32 * T0.logical_size[3]"_, + "( ( blockIdx.x * 128 + threadIdx.x ) % ( T0.logical_size[3] * 24 ) ) / ( 8 * T0.logical_size[3] )"_, {}); } @@ -838,30 +838,32 @@ TEST_F(ExprSimplifierTest, Compare) { EXPECT_TRUE(*simplify("d1 >= d1 * d2"_, "d1 <= 0.0 && d2 >= 1.0"_)); EXPECT_TRUE( *simplifyExpr( - "ceilDiv( T0.size[0] , 128 ) * 4 >= ceilDiv( T0.size[0] , 128 )"_) + "ceilDiv( T0.logical_size[0] , 128 ) * 4 >= ceilDiv( T0.logical_size[0] , 128 )"_) ->getBool()); EXPECT_TRUE(*simplify("ceilDiv( i1 , i2 ) > 0"_, "i1 > 0 && i2 > 0"_)); EXPECT_TRUE(*simplify("ceilDiv( i1 , i2 ) >= 1"_, "i1 > 0 && i2 > 0"_)); EXPECT_TRUE(*simplify( - "blockIdx.x < ceilDiv( T0.size[0] , 128 ) * 4"_, - "blockIdx.x < ceilDiv( T0.size[0] , 128 ) * 4"_)); + "blockIdx.x < ceilDiv( T0.logical_size[0] , 128 ) * 4"_, + "blockIdx.x < ceilDiv( T0.logical_size[0] , 128 ) * 4"_)); EXPECT_TRUE(*simplify("i1 % i2 < i2"_, "i2 >= 0"_)); } TEST_F(ExprSimplifierTest, FundamentalDivisionWithRemainderProperty) { - EXPECT_TRUE( - isEquivalent("i1 / T1.size[0] * T1.size[0] + i1 % T1.size[0]"_, "i1"_)); EXPECT_TRUE(isEquivalent( - "( i2 + i1 / T1.size[0] * T1.size[0] ) + i1 % T1.size[0]"_, "i1 + i2"_)); + "i1 / T1.logical_size[0] * T1.logical_size[0] + i1 % T1.logical_size[0]"_, + "i1"_)); + EXPECT_TRUE(isEquivalent( + "( i2 + i1 / T1.logical_size[0] * T1.logical_size[0] ) + i1 % T1.logical_size[0]"_, + "i1 + i2"_)); EXPECT_TRUE(isEquivalent( - "( i1 / T1.size[0] ) * ( T1.size[0] * T1.size[1] ) + T1.size[1] * ( i1 % T1.size[0] )"_, - "i1 * T1.size[1]"_)); + "( i1 / T1.logical_size[0] ) * ( T1.logical_size[0] * T1.logical_size[1] ) + T1.logical_size[1] * ( i1 % T1.logical_size[0] )"_, + "i1 * T1.logical_size[1]"_)); EXPECT_TRUE(isEquivalent( - "i2 + ( i1 / T1.size[0] ) * ( T1.size[0] * T1.size[1] ) + T1.size[1] * ( i1 % T1.size[0] )"_, - "i1 * T1.size[1] + i2"_)); + "i2 + ( i1 / T1.logical_size[0] ) * ( T1.logical_size[0] * T1.logical_size[1] ) + T1.logical_size[1] * ( i1 % T1.logical_size[0] )"_, + "i1 * T1.logical_size[1] + i2"_)); } TEST_F(ExprSimplifierTest, ReducePredicateRegisterUsage) { @@ -1030,14 +1032,15 @@ TEST_F(ExprSimplifierTest, MinMax) { }; auto expr = - "max( max( ceilDiv( T0.size[0] , 128 ) * 4 , ceilDiv( T0.size[0] , 128 ) ) , 4 )"_; - EXPECT_TRUE(simplify(expr, "T0.size[0] > 0"_) - ->sameAs("ceilDiv( T0.size[0] , 128 ) * 4"_)); + "max( max( ceilDiv( T0.logical_size[0] , 128 ) * 4 , ceilDiv( T0.logical_size[0] , 128 ) ) , 4 )"_; + EXPECT_TRUE(simplify(expr, "T0.logical_size[0] > 0"_) + ->sameAs("ceilDiv( T0.logical_size[0] , 128 ) * 4"_)); } TEST_F(ExprSimplifierTest, PredicateDivToMul) { - auto simplified = simplifyExpr("i1 / T0.size[0] < i2"_, {}, {"i1 >= 0"_}); - auto expect = "i1 < ( i2 * T0.size[0] )"_; + auto simplified = + simplifyExpr("i1 / T0.logical_size[0] < i2"_, {}, {"i1 >= 0"_}); + auto expect = "i1 < ( i2 * T0.logical_size[0] )"_; EXPECT_TRUE(simplified->sameAs(expect)); } From bc847b10179200f296784f2f7d33d523584cd5fc Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 22:30:45 -0700 Subject: [PATCH 28/33] fix AllocationDomainTest --- csrc/executor_utils.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 85fe91d788a..590ba028518 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -628,7 +628,7 @@ void validateAlignedVectorizedFusionInputOutput( } } - auto sizes = std::vector(metadata["logical_size"]); + auto sizes = std::vector(metadata["alloc_size"]); auto strides = std::vector(metadata["alloc_stride"]); TORCH_INTERNAL_ASSERT(sizes.size() == no_reduction_to_full.size()); TORCH_INTERNAL_ASSERT(strides.size() == no_reduction_to_full.size()); @@ -682,7 +682,9 @@ void validateAlignedVectorizedFusionInputOutput( " Domain: ", tv->axis(i)->toString(), ", stride: ", - stride) + stride, + ", cur_contig_stride ", + cur_contig_stride); // If the domain is size-1, the next domain is still considered // rightmost. still_rightmost = From c815b873d5b97110ec32e81e41e55c80fc52e2ee Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 22:45:09 -0700 Subject: [PATCH 29/33] fix MetadataAsTensor --- csrc/codegen.cpp | 4 ++-- csrc/type.cpp | 7 ++++--- csrc/type.h | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 88e2305cab3..95c249db52f 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -589,7 +589,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { [&](auto&& dtype) { using T = std::decay_t; if constexpr (std::is_same_v) { - for (auto& [name, _] : dtype.types) { + for (auto& name : dtype.field_names) { indent() << gen(gop->output(0)) << "." << name << " = " << gen(gop->in()) << "." << name << ";\n"; } @@ -1402,7 +1402,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { ldst->out()->dtype(), " = ", ldst->in()->dtype()); - for (auto& [name, _] : out_type.types) { + for (auto& name : out_type.field_names) { TORCH_INTERNAL_ASSERT( in_type.types.find(name) != in_type.types.end(), "Mismatched field in struct assignment: ", diff --git a/csrc/type.cpp b/csrc/type.cpp index 40377a27352..ddb98f8fa0f 100644 --- a/csrc/type.cpp +++ b/csrc/type.cpp @@ -35,6 +35,7 @@ DataType metaDataTypeOf(const Val* v) { StructOf tv_metadata; tv_metadata.name = ss.str(); + tv_metadata.field_names = {"data", "logical_size", "alloc_stride"}; tv_metadata.types["data"] = NVFUSER_MAYBE_MAKE_SHARED( PointerOf{std::make_shared(tv->dtype())}); tv_metadata.types["logical_size"] = NVFUSER_MAYBE_MAKE_SHARED2( @@ -215,9 +216,9 @@ static std::string data_type2string(DataType t) { } std::stringstream ss; ss << "struct { "; - for (auto& [name, type] : dtype.types) { - ss << data_type2string(NVFUSER_MAYBE_STAR type) << " " << name - << "; "; + for (auto& name : dtype.field_names) { + ss << data_type2string(NVFUSER_MAYBE_STAR dtype.types.at(name)) + << " " << name << "; "; } ss << "}"; return ss.str(); diff --git a/csrc/type.h b/csrc/type.h index 2460d1b865a..d0d2fc067d6 100644 --- a/csrc/type.h +++ b/csrc/type.h @@ -114,6 +114,11 @@ struct StructOf { // runtime/, and anonymous structs for others. std::string name; + // The ordered list of field names. This is used to generate the struct type + // on device. This list does not necessarily contain all the fields in the + // struct, but it should contain all the fields that are used on device. + std::vector field_names; + // Note [Incomplete type support in STL] // std::unordered_map is a STL container of incomplete // type. Not all C++ STL containers supports incomplete type due to historical From 7e5dafe5ec4db934d53d798006d73c3d034e917f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 22:50:23 -0700 Subject: [PATCH 30/33] fix LoopRotationTest --- test/test_loop_rotation.cpp | 50 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/test/test_loop_rotation.cpp b/test/test_loop_rotation.cpp index 5c71941fc91..b5b0d635b03 100644 --- a/test/test_loop_rotation.cpp +++ b/test/test_loop_rotation.cpp @@ -36,13 +36,13 @@ TEST_F(LoopRotationTest, RotateInner) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T4) { NVFUSER_DEFINE_MAGIC_ZERO; Array a0; - a0 = (T0).stride; + a0 = (T0).alloc_stride; int64_t i1; i1 = a0[0]; int64_t i2; i2 = a0[1]; #pragma unroll 1 - for(nvfuser_index_t i3 = 0; i3 < T0.size[0]; ++i3) { + for(nvfuser_index_t i3 = 0; i3 < T0.logical_size[0]; ++i3) { int64_t i4; i4 = i1 * i3; int64_t i5; @@ -108,7 +108,7 @@ TEST_F(LoopRotationTest, RotateOuter) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T4) { NVFUSER_DEFINE_MAGIC_ZERO; Array a0; - a0 = (T0).stride; + a0 = (T0).alloc_stride; int64_t i1; i1 = a0[1]; int64_t i2; @@ -133,13 +133,13 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor } NVFUSER_UPDATE_MAGIC_ZERO; #pragma unroll 1 - for(nvfuser_index_t i5 = 0; i5 < T0.size[0]; ++i5) { + for(nvfuser_index_t i5 = 0; i5 < T0.logical_size[0]; ++i5) { int64_t i6; i6 = 3 * i5; int64_t i7; i7 = i2 + (i2 * i5); bool b8; - b8 = (1 + i5) < T0.size[0]; + b8 = (1 + i5) < T0.logical_size[0]; // Alias Allocation - register auto& T3 = T1; #pragma unroll @@ -211,13 +211,13 @@ TEST_F(LoopRotationTest, NonDivisibleSplit) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T4) { NVFUSER_DEFINE_MAGIC_ZERO; Array a0; - a0 = (T0).stride; + a0 = (T0).alloc_stride; int64_t i1; i1 = a0[0]; int64_t i2; i2 = a0[1]; int64_t i3; - i3 = T0.size[0] * T0.size[1]; + i3 = T0.logical_size[0] * T0.logical_size[1]; float T1[5]; float T2[5]; #pragma unroll @@ -231,7 +231,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor i5 = i4 + nvfuser_zero; if ((i5 < i3)) { T1[i4] - = T0[((i1 * (i5 / T0.size[1])) + (i2 * (i5 % T0.size[1])))]; + = T0[((i1 * (i5 / T0.logical_size[1])) + (i2 * (i5 % T0.logical_size[1])))]; } } NVFUSER_UPDATE_MAGIC_ZERO; @@ -242,7 +242,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor } NVFUSER_UPDATE_MAGIC_ZERO; #pragma unroll 1 - for(nvfuser_index_t i7 = 0; i7 < (ceilDiv((T0.size[0] * T0.size[1]), 5)); ++i7) { + for(nvfuser_index_t i7 = 0; i7 < (ceilDiv((T0.logical_size[0] * T0.logical_size[1]), 5)); ++i7) { int64_t i8; i8 = 5 * i7; int64_t i9; @@ -276,7 +276,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor i13 = i9 + (i4 + nvfuser_zero); if ((i13 < i3)) { T1[i4] - = T0[((i1 * (i13 / T0.size[1])) + (i2 * (i13 % T0.size[1])))]; + = T0[((i1 * (i13 / T0.logical_size[1])) + (i2 * (i13 % T0.logical_size[1])))]; } } NVFUSER_UPDATE_MAGIC_ZERO; @@ -321,7 +321,7 @@ TEST_F(LoopRotationTest, DoubleBuffered) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T4) { NVFUSER_DEFINE_MAGIC_ZERO; Array a0; - a0 = (T0).stride; + a0 = (T0).alloc_stride; int64_t i1; i1 = a0[0]; int64_t i2; @@ -336,7 +336,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i6; i6 = i1 * i4; bool b7; - b7 = (i4 + nvfuser_zero) < T0.size[0]; + b7 = (i4 + nvfuser_zero) < T0.logical_size[0]; #pragma unroll for(nvfuser_index_t i8 = 0; i8 < 3; ++i8) { T1[(i5 + i8)] = 0; @@ -358,7 +358,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor } NVFUSER_UPDATE_MAGIC_ZERO; #pragma unroll 1 - for(nvfuser_index_t i10 = 0; i10 < T0.size[0]; ++i10) { + for(nvfuser_index_t i10 = 0; i10 < T0.logical_size[0]; ++i10) { int64_t i11; i11 = 4 + i10; int64_t i12; @@ -370,7 +370,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i15; i15 = 3 * ((1 + i10) % 5); bool b16; - b16 = i11 < T0.size[0]; + b16 = i11 < T0.logical_size[0]; #pragma unroll for(nvfuser_index_t i8 = 0; i8 < 3; ++i8) { T1[(i12 + i8)] = 0; @@ -438,7 +438,7 @@ TEST_F(LoopRotationTest, SelectDoubleBufferLoad) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T4) { NVFUSER_DEFINE_MAGIC_ZERO; Array a0; - a0 = (T0).stride; + a0 = (T0).alloc_stride; int64_t i1; i1 = a0[1]; int64_t i2; @@ -448,7 +448,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i4; i4 = 5 * i2; bool b5; - b5 = 4 < T0.size[0]; + b5 = 4 < T0.logical_size[0]; float T1[15]; #pragma unroll for(nvfuser_index_t i6 = 0; i6 < 3; ++i6) { @@ -468,7 +468,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i9; i9 = i2 + (i2 * i7); bool b10; - b10 = ((1 + i7) + nvfuser_zero) < T0.size[0]; + b10 = ((1 + i7) + nvfuser_zero) < T0.logical_size[0]; #pragma unroll for(nvfuser_index_t i6 = 0; i6 < 3; ++i6) { T1[(i8 + i6)] = 0; @@ -503,7 +503,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor } NVFUSER_UPDATE_MAGIC_ZERO; #pragma unroll 1 - for(nvfuser_index_t i12 = 0; i12 < T0.size[0]; ++i12) { + for(nvfuser_index_t i12 = 0; i12 < T0.logical_size[0]; ++i12) { int64_t i13; i13 = 3 * i12; int64_t i14; @@ -513,7 +513,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i16; i16 = 3 * ((1 + i12) % 5); bool b17; - b17 = (5 + i12) < T0.size[0]; + b17 = (5 + i12) < T0.logical_size[0]; float T3[3]; #pragma unroll for(nvfuser_index_t i18 = 0; i18 < 3; ++i18) { @@ -596,13 +596,13 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor unsigned smem_offset = 0; NVFUSER_DEFINE_MAGIC_ZERO; Tensor s0; - s0.stride = T0.stride; - s0.size = T0.size; s0.data = T0.data; + s0.logical_size = T0.logical_size; + s0.alloc_stride = T0.alloc_stride; float* ptr1; ptr1 = s0.data; Array a2; - a2 = s0.stride; + a2 = s0.alloc_stride; int64_t i3; i3 = a2[0]; int64_t i4; @@ -619,7 +619,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor unsigned i8; i8 = (toSmem((T4))) + (12 * i6); bool b9; - b9 = (i6 + nvfuser_zero) < T0.size[0]; + b9 = (i6 + nvfuser_zero) < T0.logical_size[0]; #pragma unroll for(nvfuser_index_t i10 = 0; i10 < 3; ++i10) { Ampere::cpAsyncCa((i8 + (4 * i10)), (ptr7 + (i4 * (i10 + nvfuser_zero))), b9); @@ -632,7 +632,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1[0] = T4[0]; #pragma unroll 1 - for(nvfuser_index_t i11 = 0; i11 < T0.size[0]; ++i11) { + for(nvfuser_index_t i11 = 0; i11 < T0.logical_size[0]; ++i11) { float* ptr12; ptr12 = ptr5 + (i3 * i11); int64_t i13; @@ -644,7 +644,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i16; i16 = 3 * i11; bool b17; - b17 = i13 < T0.size[0]; + b17 = i13 < T0.logical_size[0]; #pragma unroll for(nvfuser_index_t i10 = 0; i10 < 3; ++i10) { Ampere::cpAsyncCa((i14 + (4 * i10)), (ptr12 + (i4 * (i10 + nvfuser_zero))), b17); From 1eb83380ce1fba60ace72cada6b3397db38315be Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 22:59:41 -0700 Subject: [PATCH 31/33] fix other tests --- test/test_gpu1.cpp | 2 +- test/test_gpu2.cpp | 74 +++++++++++++++++++++++----------------------- test/test_gpu3.cpp | 2 +- 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/test/test_gpu1.cpp b/test/test_gpu1.cpp index 188bd66e524..b78a60c4a00 100644 --- a/test/test_gpu1.cpp +++ b/test/test_gpu1.cpp @@ -892,7 +892,7 @@ TEST_F(NVFuserTest, FusionParser_CUDA) { __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { int64_t i0; i0 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); - if ((i0 < T0.size[0])) { + if ((i0 < T0.logical_size[0])) { float T5[1]; T5[0] = 0; T5[0] diff --git a/test/test_gpu2.cpp b/test/test_gpu2.cpp index c5f84e7b3fd..73985284b13 100644 --- a/test/test_gpu2.cpp +++ b/test/test_gpu2.cpp @@ -2255,8 +2255,8 @@ TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) { std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { if(threadIdx.x==0){ - for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) { - T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2; + for(size_t ki28 = 0; ki28 < T0.logical_size[0]; ++ki28) { + T1[ki28*T1.alloc_stride[0]] = T0[ki28*T0.alloc_stride[0]]*2; } } } @@ -2292,27 +2292,27 @@ __global__ void kernel1( Tensor out_var, Tensor out_avg ){ - for(int i0=0;i0 T0, Tensor<__half, 4, 4> T2, Tensor<__half, 4, 4> T7) { int64_t i0; - i0 = T0.size[2] * T0.size[1]; + i0 = T0.logical_size[2] * T0.logical_size[1]; int64_t i1; i1 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); int64_t i2; - i2 = (T0.size[1] * T0.size[2]) * T0.size[3]; + i2 = (T0.logical_size[1] * T0.logical_size[2]) * T0.logical_size[3]; int64_t i3; i3 = i1 % i2; int64_t i4; - i4 = T0.size[2] * T0.size[3]; + i4 = T0.logical_size[2] * T0.logical_size[3]; int64_t i5; i5 = i3 % i4; - if ((i1 < (((T0.size[0] * T0.size[1]) * T0.size[2]) * T0.size[3]))) { + if ((i1 < (((T0.logical_size[0] * T0.logical_size[1]) * T0.logical_size[2]) * T0.logical_size[3]))) { __half T9[1]; T9[0] = 0; T9[0] - = T2[(((((i0 * T0.size[3]) * (i1 / i2)) + (i0 * (i5 % T0.size[3]))) + (T0.size[2] * (i3 / i4))) + (i5 / T0.size[3]))]; + = T2[(((((i0 * T0.logical_size[3]) * (i1 / i2)) + (i0 * (i5 % T0.logical_size[3]))) + (T0.logical_size[2] * (i3 / i4))) + (i5 / T0.logical_size[3]))]; __half T8[1]; T8[0] = 0; T8[0] diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp index 668cac4e426..3a5fa452ad5 100644 --- a/test/test_gpu3.cpp +++ b/test/test_gpu3.cpp @@ -1731,7 +1731,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor int64_t i0; i0 = ((nvfuser_index_t)threadIdx.x) + (256 * ((nvfuser_index_t)blockIdx.x)); int64_t i1; - i1 = T0.size[0] * T0.size[1]; + i1 = T0.logical_size[0] * T0.logical_size[1]; bool b2; b2 = i0 < i1; float f3; From cfc9b720eb8325e89f2258562df732db21706d70 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 23:13:26 -0700 Subject: [PATCH 32/33] unchange --- csrc/expr_evaluator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 1344cb6bdd3..20c0c234791 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -144,7 +144,7 @@ void ExpressionEvaluator::print() const { debug() << "--------------------\n"; for (const auto& kv : known_values_) { TORCH_INTERNAL_ASSERT(!kv.first->isConstScalar()); - debug() << kv.first << " = " << kv.second.type().name() << " ; " + debug() << kv.first << " = " << kv.second << " ; " << *kv.first->getValType() << "\n"; } From 2a21b8f2ed877808774d535861897f778c306a5d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 27 Jul 2023 23:52:57 -0700 Subject: [PATCH 33/33] tidy --- csrc/scheduler/registry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 7706e78ea67..e56ce83c4bf 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -989,7 +989,7 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( // find and push discontiguous stride int64_t dtype_size = dataTypeSize(input_tv->dtype()); input_discontig_strides_[fusion_inp] = {}; - int64_t dims = alloc_strides.size(); + int64_t dims = (int64_t)alloc_strides.size(); int64_t expected_stride = 1; for (int64_t dim = dims - 1; dim >= 0; dim--) { auto size = alloc_sizes.at(dim);