From 9a0dc9e8d6e0cdc8699384a8bae203591d01ec4b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:29:58 -0700 Subject: [PATCH 1/5] host ir alias and prealloc output support --- csrc/host_ir/container.cpp | 5 + csrc/host_ir/container.h | 12 ++ csrc/host_ir/executor.cpp | 216 ++++++++++++++++++------------------ csrc/host_ir/executor.h | 33 +++++- tests/cpp/test_host_irs.cpp | 112 ++++++++++++++++++- 5 files changed, 268 insertions(+), 110 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index ca4c98f2a56..a133c0521ba 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -35,6 +35,11 @@ Stream* HostIrContainer::getDefaultStream() { std::ostream& HostIrContainer::print(std::ostream& os) const { IrMathPrinter op_exprs(os); op_exprs.handle(this); + os << "Aliases:{"; + for (const auto& alias : alias_) { + os << "\n " << alias.first << " -> " << alias.second; + } + os << "\n}\n"; return os; } diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index ab029f63fa2..7dcd66b4436 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -55,10 +55,22 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); + void markAlias(TensorView* original, const TensorView* new_alias) { + if (alias_.count(original)) { + original = alias_[original]->as(); + } + alias_[new_alias] = original; + } + + const auto& alias() const { + return alias_; + } + private: std::vector top_level_exprs_; std::vector> kernel_executors_; Stream* default_stream_ = nullptr; + std::unordered_map alias_; }; } // namespace hir diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 4b6c2fe6796..4b631bd0061 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -171,32 +171,6 @@ KernelArgumentHolder HostIrExecutor::run( namespace hir { -namespace { - -at::Tensor getKnownTensorOrUndefined( - Val* val, - const ExpressionEvaluator& expr_evaluator) { - return expr_evaluator.isKnown(val) - ? expr_evaluator.evaluate(val).as() - : at::Tensor(); -} - -KernelArgumentHolder getKnownTensorOrUndefined( - const std::vector& vals, - const ExpressionEvaluator& expr_evaluator) { - std::vector tensors(vals.size()); - std::transform( - vals.begin(), - vals.end(), - tensors.begin(), - [&expr_evaluator](Val* val) -> at::Tensor { - return getKnownTensorOrUndefined(val, expr_evaluator); - }); - return KernelArgumentHolder(tensors); -} - -} // namespace - HostIrEvaluator::HostIrEvaluator( std::unique_ptr container, Communicator* communicator, @@ -216,10 +190,23 @@ HostIrEvaluator::HostIrEvaluator( {container_->getDefaultStream(), c10::cuda::getDefaultCUDAStream( static_cast(device_index))}); - expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + NVF_ERROR( + std::all_of( + container_->inputs().begin(), + container_->inputs().end(), + [this](Val* input) { return !container_->alias().count(input); }), + "Inputs cannot be aliased"); } -KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { +KernelArgumentHolder HostIrEvaluator::runWithInput( + const std::unordered_map& val_to_PValue) { + expr_evaluator_ = ExpressionEvaluator(); + expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); + // process input values, converting IValue to PolymorphicValue + for (const auto& [val, pvalue] : val_to_PValue) { + bind(val, pvalue); + } + // Interpret each instruction in an "eager" way by iterate over the Host Ir // Container's top level expression list for (auto expr : container_->topLevelExprs()) { @@ -227,17 +214,15 @@ KernelArgumentHolder HostIrEvaluator::dispatchAndCollectOutputs() { } // Collect global outputs - return getKnownTensorOrUndefined(container_->outputs(), expr_evaluator_); -} - -KernelArgumentHolder HostIrEvaluator::runWithInput( - const std::unordered_map& val_to_PValue) { - // process input values, converting IValue to PolymorphicValue - for (const auto& [val, pvalue] : val_to_PValue) { - expr_evaluator_.bind(val, pvalue); - } - - return dispatchAndCollectOutputs(); + std::vector outputs(container_->outputs().size()); + std::transform( + container_->outputs().begin(), + container_->outputs().end(), + outputs.begin(), + [this](Val* val) -> at::Tensor { + return this->getKnownTensorOrUndefined(val); + }); + return KernelArgumentHolder(outputs); } std::string HostIrEvaluator::canRun() const { @@ -320,13 +305,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - launch_kernel->toString()); - args.push(expr_evaluator_.evaluate(input)); + args.push(getKnownConcreteData(input)); } args.setDeviceIndex(); @@ -341,25 +320,35 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { // Store the outputs in the context for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - launch_kernel->outputs().at(output_idx), outputs[output_idx]); + bind(launch_kernel->outputs().at(output_idx), outputs[output_idx]); } } void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "No buffer associated with Val ", - input, - " for handling ", - post_ir->toString()); - input_args.push(expr_evaluator_.evaluate(input)); + input_args.push(getKnownConcreteData(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs KernelArgumentHolder outputs; + bool use_preallocated_outputs = std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return this->isKnown(output); }); + NVF_ERROR( + use_preallocated_outputs || + std::all_of( + post_ir->outputs().begin(), + post_ir->outputs().end(), + [this](Val* output) { return !this->isKnown(output); }), + "outputs must be all or none preallocated in expr ", + post_ir); + if (use_preallocated_outputs) { + for (auto output : post_ir->outputs()) { + outputs.push(getKnownConcreteData(output)); + } + } NVF_ERROR( post_ir->hostOpToPost()->isA(), @@ -376,16 +365,23 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { /*fusion_id=*/0, !params_.skip_auto_scheduling); } - outputs = fec_.at(hu).runFusionWithInputs(input_args); + if (use_preallocated_outputs) { + TORCH_WARN( + "FusionExecutorCache does not support with preallocated outputs, so we are copying the outputs in expr ", + post_ir); + auto tmp_outputs = fec_.at(hu).runFusionWithInputs(input_args); + for (auto output_idx : c10::irange(tmp_outputs.size())) { + outputs[output_idx].as().copy_( + tmp_outputs[output_idx].as()); + } + } else { + outputs = fec_.at(hu).runFusionWithInputs(input_args); + } } else { // This path should generally be avoided as it will likely send the fusion // held in HostUnit directly to KernelExecutor which means it will try to // compile and run a device kernel with a single thread. - if (auto it = executors_.find(hu); it != executors_.end()) { - ExecutorAbstract* ea = it->second.get(); - outputs = ExecutorDispatch::run(ea, input_args); - - } else { + if (auto it = executors_.find(hu); it == executors_.end()) { DynamicTransform::concretizeFusion(hu->fusion_to_execute(), input_args); auto it2 = executors_.insert( {hu, @@ -402,14 +398,20 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { } else { ExecutorDispatch::compile(ea, hu->fusion_to_execute()); } + } + ExecutorAbstract* ea = executors_[hu].get(); + if (use_preallocated_outputs) { + ExecutorDispatch::run(ea, input_args, outputs); + } else { outputs = ExecutorDispatch::run(ea, input_args); } } - // Store the outputs in the context - for (auto output_idx : c10::irange(outputs.size())) { - expr_evaluator_.bind( - post_ir->outputs().at(output_idx), outputs[output_idx]); + if (!use_preallocated_outputs) { + // Store the outputs in the context + for (auto output_idx : c10::irange(outputs.size())) { + bind(post_ir->outputs().at(output_idx), outputs[output_idx]); + } } } @@ -418,10 +420,9 @@ void HostIrEvaluator::handle(Communication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor input_tensor = - getKnownTensorOrUndefined(communication->input(0), expr_evaluator_); + at::Tensor input_tensor = getKnownTensorOrUndefined(communication->input(0)); at::Tensor output_tensor = - getKnownTensorOrUndefined(communication->output(0), expr_evaluator_); + getKnownTensorOrUndefined(communication->output(0)); CommunicatorBackend backend_type = communication->backend(); c10d::Backend* backend = @@ -439,8 +440,7 @@ void HostIrEvaluator::handle(P2PCommunication* communication) { communicator_ != nullptr && communicator_->is_available(), "A valid communicator must be provided"); - at::Tensor buffer = - getKnownTensorOrUndefined(communication->buffer(), expr_evaluator_); + at::Tensor buffer = getKnownTensorOrUndefined(communication->buffer()); works_[communication] = postSingleCommunication( communication, @@ -495,11 +495,11 @@ void HostIrEvaluator::handle(ForLoop* for_loop) { for (auto i = start; i < stop; i += step) { // invalidate i and its consumers before binding - expr_evaluator_.invalidate(for_loop->index()); + invalidate(for_loop->index()); for (auto consumer : allConsumerValsOf(for_loop->index())) { - expr_evaluator_.invalidate(consumer); + invalidate(consumer); } - expr_evaluator_.bind(for_loop->index(), i); + bind(for_loop->index(), i); for (Expr* expr : for_loop->body().exprs()) { dispatch(expr); } @@ -536,15 +536,11 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* a = matmul->inA(); TensorView* b = matmul->inB(); TensorView* out = matmul->out(); - NVF_ERROR( - expr_evaluator_.isKnown(a) && expr_evaluator_.isKnown(b), - "Inputs of the matmul ", - matmul->toString(), - "must be precomputed before being retrieved"); - if (expr_evaluator_.isKnown(out)) { - auto t_a = expr_evaluator_.evaluate(a).as(); - auto t_b = expr_evaluator_.evaluate(b).as(); - auto t_out = expr_evaluator_.evaluate(out).as(); + + if (isKnown(out)) { + auto t_a = getKnownConcreteData(a).as(); + auto t_b = getKnownConcreteData(b).as(); + auto t_out = getKnownConcreteData(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -556,24 +552,18 @@ void HostIrEvaluator::handle(LinearOp* linear) { TensorView* weight = linear->inB()->as(); TensorView* bias = linear->bias()->as(); TensorView* out = linear->out()->as(); - NVF_ERROR( - expr_evaluator_.isKnown(in) && expr_evaluator_.isKnown(weight) && - (!linear->has_bias() || expr_evaluator_.isKnown(bias)), - "Inputs of the Linear Op ", - linear->toString(), - "must be precomputed before being retrieved"); - if (!expr_evaluator_.isKnown(out)) { + if (!isKnown(out)) { unhandled(linear); return; } - auto in_at = expr_evaluator_.evaluate(in).as(); - auto weight_at = expr_evaluator_.evaluate(weight).as(); - auto out_at = expr_evaluator_.evaluate(out).as(); + auto in_at = getKnownConcreteData(in).as(); + auto weight_at = getKnownConcreteData(weight).as(); + auto out_at = getKnownConcreteData(out).as(); if (linear->has_bias()) { - auto bias_at = expr_evaluator_.evaluate(bias).as(); + auto bias_at = getKnownConcreteData(bias).as(); at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -600,25 +590,37 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { c10::nullopt, device, c10::nullopt); - - expr_evaluator_.bind(tv, tensor); + bind(tv, tensor); } void HostIrEvaluator::unhandled(Statement* stmt) { NVF_ERROR(stmt->isA(), stmt, " must be an Expr"); auto* expr = stmt->as(); - for (auto input : ir_utils::filterByType(expr->inputs())) { - NVF_ERROR( - expr_evaluator_.isKnown(input), - "input ", - input->toString(), - " of the expression ", - expr->toString(), - "must be precomputed before being retrieved"); - } - for (auto output : expr->outputs()) { - expr_evaluator_.bind( - output, expr_evaluator_.evaluate(output), /*evaluate_validate=*/true); + std::vector inputs; + for (auto input : expr->inputs()) { + if (input->isA()) { + // Tensor inputs must be already computed at this point + inputs.push_back(getKnownConcreteData(input)); + } else { + inputs.push_back(expr_evaluator_.evaluate(input)); + } + } + + // Check that there is no pre-allocated output + NVF_ERROR( + std::all_of( + expr->outputs().begin(), + expr->outputs().end(), + [this](Val* output) { + return !this->expr_evaluator_.isKnown(output); + }), + "Do not support pre-allocated outputs for the op ", + expr); + // using ExpressionEvaluator::evaluate to evaluate the output is not valid + // here if the output or one of its producer is an alias + auto concrete_outputs = expr->evaluate(expr_evaluator_, inputs); + for (int64_t i : c10::irange(expr->outputs().size())) { + bind(expr->output(i), concrete_outputs.at(i)); } } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index f1b8ed4ef88..73f52a7bb90 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -133,7 +133,38 @@ class HostIrEvaluator final : public OptOutDispatch { c10::cuda::CUDAStream getCUDAStream(Stream* stream); - KernelArgumentHolder dispatchAndCollectOutputs(); + Val* getAlias(Val* val) const { + const auto& aliases = container_->alias(); + auto it = aliases.find(val); + return it != aliases.end() ? it->second : val; + } + + bool isKnown(Val* value) const { + return expr_evaluator_.isKnown(getAlias(value)); + } + + PolymorphicValue getKnownConcreteData(Val* val) const { + NVF_ERROR( + isKnown(val), + "value ", + val->toString(), + "must be precomputed before being retrieved"); + return expr_evaluator_.evaluate(getAlias(val)); + } + + at::Tensor getKnownTensorOrUndefined(Val* val) const { + return isKnown(val) + ? expr_evaluator_.evaluate(getAlias(val)).as() + : at::Tensor(); + } + + void bind(Val* value, PolymorphicValue concrete_value) { + expr_evaluator_.bind(getAlias(value), concrete_value); + } + + void invalidate(Val* value) { + expr_evaluator_.invalidate(getAlias(value)); + } std::unique_ptr container_; Communicator* communicator_; diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index da85466c8a0..ba296f6f357 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -456,6 +456,62 @@ TEST_P(HostIrTest, ForLoops) { EXPECT_TRUE(expected_result.equal(buffer_at)); } +TEST_P(HostIrTest, PreAllocatedOutputs) { + const std::vector input_sizes = {4, 8, 32}; + const std::vector output_sizes = { + input_sizes.at(1), input_sizes.at(2)}; + + auto get_fusion = [input_sizes]() -> std::unique_ptr { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(input_sizes); + auto tv1 = add(tv0, tv0); + auto tv2 = sum(tv1, {0}); + fusion->addInput(tv0); + fusion->addOutput(tv2); + return fusion; + }; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + auto host_unit = IrBuilder::create(get_fusion()); + + IrCloner ir_cloner(hic.get()); + std::vector post_on_stream_inputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->inputs().at(0))}; + std::vector post_on_stream_outputs = { + ir_cloner.clone(host_unit->fusion_to_execute()->outputs().at(0))}; + + auto post_on_stream = IrBuilder::create( + host_unit, post_on_stream_inputs, post_on_stream_outputs); + + hic->pushBackTopLevelExprs(post_on_stream); + + hic->addInput(post_on_stream->inputs().at(0)); + hic->addInput(post_on_stream->outputs().at(0)); + + HostIrEvaluatorParams params; + auto [use_fusion_executor_cache] = GetParam(); + params.use_fusion_executor_cache = use_fusion_executor_cache; + HostIrEvaluator hie(std::move(hic), nullptr, params); + + // define concrete inputs and compute ref output for validation + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto input = at::randn(input_sizes, options); + auto output = at::empty(output_sizes, options); + auto ref_output = at::sum(input * 2, {0}); + + hie.runWithInput( + {{post_on_stream->inputs().at(0), input}, + {post_on_stream->outputs().at(0), output}}); + + // validate the obtained results + GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + << "Output: " << output << " Expected: " << ref_output; +} + INSTANTIATE_TEST_SUITE_P( , HostIrTest, @@ -1095,7 +1151,12 @@ TEST_F(IfThenElseTest, HostIr) { hic->addOutput(output_buffer); hic->pushBackTopLevelExprs(if_then_else); - HostIrEvaluator hie(std::move(hic)); + // Need to use FusionExecutorCache, otherwise hitting error + // https://github.com/NVIDIA/Fuser/blob/4d032f74d2347fd68f5be607ef94956500eb917b/csrc/runtime/executor.cpp#L750 + HostIrEvaluator hie( + std::move(hic), + /*Communicator=*/nullptr, + {.use_fusion_executor_cache = true}); for (auto boolean : {true, false}) { auto options = @@ -1155,7 +1216,7 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = abs(tv0); + TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); for_loop->body().push_back(tv1->definition()); @@ -1170,6 +1231,53 @@ TEST_F(AllocationTest, inHostForLoop) { EXPECT_EQ(sizes, outputs[0].as().sizes()); } +using HirAlias = NVFuserTest; + +TEST_F(HirAlias, SetAndGet) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + hic->markAlias(tv1, tv2); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + +TEST_F(HirAlias, ThrowOnInputAlias) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + hic->markAlias(tv1, tv0); + hic->addInput(tv0); + + EXPECT_ANY_THROW(HostIrEvaluator hie(std::move(hic))); +} + } // namespace hir } // namespace nvfuser From 9820d5aba5b81a02ee96dbcc5ba651837d6add8c Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 26 Mar 2025 04:48:56 -0700 Subject: [PATCH 2/5] harden and simplify allocation in for loop test --- tests/cpp/test_host_irs.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index ba296f6f357..654e60c5d31 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -1216,13 +1216,11 @@ TEST_F(AllocationTest, inHostForLoop) { TensorView* tv0 = makeConcreteTensor(sizes); tv0->setMemoryType(MemoryType::Global); auto* allocate = IrBuilder::create(tv0, MemoryType::Global); - TensorView* tv1 = set(tv0); for_loop->body().push_back(allocate); - for_loop->body().push_back(tv1->definition()); hic->pushBackTopLevelExprs(for_loop); - hic->addOutput(tv1); + hic->addOutput(tv0); HostIrEvaluator hie(std::move(hic)); From e1db5183158a3662827a6cb70caae7e63c0e2191 Mon Sep 17 00:00:00 2001 From: snordmann Date: Mon, 14 Apr 2025 06:49:48 -0700 Subject: [PATCH 3/5] reviews --- csrc/host_ir/container.h | 2 +- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 41 ++++++++++++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 7dcd66b4436..07bd896790b 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -56,7 +56,7 @@ class HostIrContainer final : public Fusion { Stream* getDefaultStream(); void markAlias(TensorView* original, const TensorView* new_alias) { - if (alias_.count(original)) { + while (alias_.count(original)) { original = alias_[original]->as(); } alias_[new_alias] = original; diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 8f4d425fd09..c1486285d19 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -140,7 +140,7 @@ class HostIrEvaluator final : public OptOutDispatch { Val* getAlias(Val* val) const { const auto& aliases = container_->alias(); auto it = aliases.find(val); - return it != aliases.end() ? it->second : val; + return it != aliases.end() ? getAlias(it->second) : val; } bool isKnown(Val* value) const { diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 654e60c5d31..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -139,7 +139,7 @@ TEST_P(HostIrTest, SingleFusion) { auto outputs = hie.runWithInput({{post_on_stream->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -236,7 +236,7 @@ TEST_P(HostIrTest, TwoFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[0].as())); } /* @@ -365,7 +365,7 @@ TEST_P(HostIrTest, ThreeFusions) { auto outputs = hie.runWithInput({{post_on_stream_0->inputs().at(0), t0_0}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); + EXPECT_TRUE(torch::allclose(t2_2, outputs[0].as())); } // This unit test the for-loop IR by implementing a program that could be @@ -508,7 +508,7 @@ TEST_P(HostIrTest, PreAllocatedOutputs) { {post_on_stream->outputs().at(0), output}}); // validate the obtained results - GTEST_EXPECT_TRUE(torch::allclose(ref_output, output)) + EXPECT_TRUE(torch::allclose(ref_output, output)) << "Output: " << output << " Expected: " << ref_output; } @@ -724,7 +724,7 @@ TEST_P(StreamHostIrTest, SingleFusionMultipleStreams) { // validate the obtained results for (int i = 0; i < n_iterations; i++) { - GTEST_EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); + EXPECT_TRUE(torch::allclose(ref_output, outputs[i].as())); } EXPECT_NE( c10::cuda::getDefaultCUDAStream(0), c10::cuda::getCurrentCUDAStream(0)); @@ -1262,6 +1262,37 @@ TEST_F(HirAlias, SetAndGet) { << "Expected output: " << expected_out; } +TEST_F(HirAlias, SetAndGetReversedOrder) { + const std::vector sizes = {8, 64}; + + auto hic = std::make_unique(); + FusionGuard fg(hic.get()); + + TensorView* tv0 = makeConcreteTensor(sizes); + TensorView* tv1 = set(tv0); + TensorView* tv2 = makeConcreteTensor(sizes); + TensorView* tv3 = set(tv2); + TensorView* tv4 = makeConcreteTensor(sizes); + hic->markAlias(tv3, tv4); + hic->markAlias(tv1, tv2); + hic->addInput(tv0); + hic->addOutput(tv4); + hic->pushBackTopLevelExprs(tv1->definition()); + hic->pushBackTopLevelExprs(tv3->definition()); + + HostIrEvaluator hie(std::move(hic)); + + auto options = at::TensorOptions().device(at::kCUDA, 0); + at::Tensor tv0_aten = at::randn(sizes, options); + + at::Tensor out_aten = hie.runWithInput({{tv0, tv0_aten}})[0].as(); + + at::Tensor expected_out = tv0_aten; + EXPECT_TRUE(out_aten.equal(expected_out)) + << "Obtained output: " << out_aten << "\n" + << "Expected output: " << expected_out; +} + TEST_F(HirAlias, ThrowOnInputAlias) { const std::vector sizes = {8, 64}; From eb46aef5aa46340e883d858fb4e707f27e6d28d3 Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:22:24 -0700 Subject: [PATCH 4/5] minor comment --- csrc/host_ir/container.cpp | 2 +- csrc/host_ir/executor.cpp | 22 +++++++++++----------- csrc/host_ir/executor.h | 2 +- tests/cpp/test_host_irs.cpp | 1 + 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/csrc/host_ir/container.cpp b/csrc/host_ir/container.cpp index a133c0521ba..83e668770fc 100644 --- a/csrc/host_ir/container.cpp +++ b/csrc/host_ir/container.cpp @@ -26,7 +26,7 @@ HostIrContainer::HostIrContainer(int64_t num_kernel_executors) HostIrContainer::~HostIrContainer() = default; Stream* HostIrContainer::getDefaultStream() { - if (!default_stream_) { + if (default_stream_ == nullptr) { default_stream_ = IrBuilder::createInContainer(this); } return default_stream_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 551f35487c9..89710eaae4b 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -327,7 +327,7 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { KernelArgumentHolder args; for (auto& input : launch_kernel->inputs()) { - args.push(getKnownConcreteData(input)); + args.push(getKnownConcreteValue(input)); } args.setDeviceIndex(); @@ -349,7 +349,7 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { void HostIrEvaluator::handle(PostOnStream* post_ir) { KernelArgumentHolder input_args; for (auto& input : post_ir->inputs()) { - input_args.push(getKnownConcreteData(input)); + input_args.push(getKnownConcreteValue(input)); } input_args.setDeviceIndex(); // placeholder for storing the outputs @@ -368,7 +368,7 @@ void HostIrEvaluator::handle(PostOnStream* post_ir) { post_ir); if (use_preallocated_outputs) { for (auto output : post_ir->outputs()) { - outputs.push(getKnownConcreteData(output)); + outputs.push(getKnownConcreteValue(output)); } } @@ -599,9 +599,9 @@ void HostIrEvaluator::handle(MatmulOp* matmul) { TensorView* out = matmul->out(); if (isKnown(out)) { - auto t_a = getKnownConcreteData(a).as(); - auto t_b = getKnownConcreteData(b).as(); - auto t_out = getKnownConcreteData(out).as(); + auto t_a = getKnownConcreteValue(a).as(); + auto t_b = getKnownConcreteValue(b).as(); + auto t_out = getKnownConcreteValue(out).as(); at::matmul_out(t_out, t_a, t_b); } else { unhandled(matmul); @@ -619,12 +619,12 @@ void HostIrEvaluator::handle(LinearOp* linear) { return; } - auto in_at = getKnownConcreteData(in).as(); - auto weight_at = getKnownConcreteData(weight).as(); - auto out_at = getKnownConcreteData(out).as(); + auto in_at = getKnownConcreteValue(in).as(); + auto weight_at = getKnownConcreteValue(weight).as(); + auto out_at = getKnownConcreteValue(out).as(); if (linear->has_bias()) { - auto bias_at = getKnownConcreteData(bias).as(); + auto bias_at = getKnownConcreteValue(bias).as(); at::linear_out(out_at, in_at, weight_at.squeeze(), bias_at.squeeze()); } else { at::linear_out(out_at, in_at, weight_at.squeeze()); @@ -661,7 +661,7 @@ void HostIrEvaluator::unhandled(Statement* stmt) { for (auto input : expr->inputs()) { if (input->isA()) { // Tensor inputs must be already computed at this point - inputs.push_back(getKnownConcreteData(input)); + inputs.push_back(getKnownConcreteValue(input)); } else { inputs.push_back(expr_evaluator_.evaluate(input)); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index b603359d5b3..d71b74e0dda 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -149,7 +149,7 @@ class HostIrEvaluator final : public OptOutDispatch { return expr_evaluator_.isKnown(getAlias(value)); } - PolymorphicValue getKnownConcreteData(Val* val) const { + PolymorphicValue getKnownConcreteValue(Val* val) const { NVF_ERROR( isKnown(val), "value ", diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index 6a41e47c744..f8660f3bb83 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,6 +6,7 @@ */ // clang-format on #include +#include #include #include From 5f161f5357d9994cd8e9122e06262b02a343103b Mon Sep 17 00:00:00 2001 From: snordmann Date: Wed, 16 Apr 2025 02:44:03 -0700 Subject: [PATCH 5/5] lint --- tests/cpp/test_host_irs.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index f8660f3bb83..6a41e47c744 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -6,7 +6,6 @@ */ // clang-format on #include -#include #include #include