Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
a053511
Store alloc and logical info in global buffer info.
csarofeen Feb 24, 2025
c6eb652
Draft removing expr_eval from KernelExecutor::run.
csarofeen Feb 24, 2025
543f910
Lint.
csarofeen Feb 24, 2025
f3d4320
Build fixes.
csarofeen Feb 24, 2025
66777eb
Getting closer, just an IMA now.
csarofeen Feb 24, 2025
0a30236
Fix basic tests.
csarofeen Feb 25, 2025
93f7ebd
Still debugging.
csarofeen Feb 25, 2025
ddd62d3
Fix TMA Support.
csarofeen Feb 25, 2025
f91c252
Drop.
csarofeen Feb 25, 2025
cf15f43
All but one allocation domain test working.
csarofeen Feb 25, 2025
eaca8a8
All allocation tests working.
csarofeen Feb 25, 2025
651dd68
Debugging in process.
csarofeen Feb 26, 2025
ce150d9
Down to 5 failures in test_nvfuser.
csarofeen Feb 26, 2025
561dfb1
Slice fix.
csarofeen Feb 27, 2025
868afb7
Debugging.
csarofeen Feb 27, 2025
f6f57ea
Debugging still.
csarofeen Feb 27, 2025
c589bc5
Distributed tests working.
csarofeen Feb 27, 2025
f562896
Remove precomputed values from initialize executor entry, for some re…
csarofeen Mar 1, 2025
eb291c5
Fix most failures.
csarofeen Mar 1, 2025
e4d94c6
Cleanup.
csarofeen Mar 1, 2025
ae81a73
Fix serialization.
csarofeen Mar 1, 2025
378d251
Fix serialization.
csarofeen Mar 2, 2025
5b84a33
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Mar 2, 2025
9ba868e
Clang.
csarofeen Mar 3, 2025
9353408
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Mar 3, 2025
32a87fb
Remove output to output aliasing, it's not used.
csarofeen Mar 3, 2025
cec8ab3
Update HostIR Exec and remove duplicate allocation code.
csarofeen Mar 3, 2025
e05cd7f
Revert test debug changes.
csarofeen Mar 3, 2025
7c2ac65
Rename and remove dead code.
csarofeen Mar 3, 2025
7a320bd
Clang.
csarofeen Mar 3, 2025
15b03c5
Cleanup, remove dead code.
csarofeen Mar 3, 2025
0db5b89
Cleanup argument to byte conversion.
csarofeen Mar 3, 2025
f2be006
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Mar 3, 2025
2241594
PR Comments.
csarofeen Mar 4, 2025
9cbff16
Clang.
csarofeen Mar 4, 2025
28e7cf9
Make allocation sizes/strides optional in tensor shape info.
csarofeen Mar 4, 2025
a6252f7
Remove intermediate logic in allocations.cpp output logic can be used…
csarofeen Mar 4, 2025
1769277
Merge branch 'main' into executor_cleanup
csarofeen Mar 4, 2025
9e5a9ca
Fix for ExpandedBroadcastGlobalIntermediateTest
csarofeen Mar 4, 2025
c23ca58
Merge branch 'main' of https://github.com/NVIDIA/Fuser into executor_…
csarofeen Mar 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,5 @@ foo.bin

# Mac OS internal file
.DS_Store

test_log*
9 changes: 3 additions & 6 deletions benchmarks/cpp/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ static void SingleMatmulBase(
// Compile kernel
auto launch_constraints = LaunchParams();
KernelExecutor ke;
ke.compile(fusion, args.toC10Array(), launch_constraints, cparams);
ke.compile(fusion, args, launch_constraints, cparams);
NVF_CHECK(
getBankConflictInfo(ke.compiledKernel()->kernel(), launch_constraints)
.empty(),
Expand Down Expand Up @@ -352,7 +352,7 @@ static void SingleMatmulPartitionedK(
// Compile kernel
KernelExecutor ke;
auto lparams = LaunchParams();
ke.compile(fusion, args.toC10Array(), lparams, cparams);
ke.compile(fusion, args, lparams, cparams);
NVF_CHECK(
getBankConflictInfo(ke.compiledKernel()->kernel(), lparams).empty(),
"Shared memory bank conflict not removed.");
Expand Down Expand Up @@ -461,10 +461,7 @@ static void NvFuserScheduler_MatmulSplitKReduction(
// Compile kernel
KernelExecutor ke;
ke.compile(
fusion,
args.toC10Array(),
heuristic_params->lparams,
heuristic_params->cparams);
fusion, args, heuristic_params->lparams, heuristic_params->cparams);

NVF_CHECK(
getBankConflictInfo(
Expand Down
66 changes: 43 additions & 23 deletions csrc/host_ir/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,6 @@ bool HostIrExecutor::isCompiled() const {
return (bool)host_ir_container_;
}

namespace {
// Host IR specific function, returns the at::Tensor (ordered list) associated
// with the provided Fusion output tv
at::Tensor findBufferForFusionOutput(
const KernelArgumentHolder& output_args,
const Val* fusion_out,
const Fusion* fusion) {
auto i =
std::find(fusion->outputs().begin(), fusion->outputs().end(), fusion_out);
NVF_ERROR(i != fusion->outputs().end());
auto index = std::distance(fusion->outputs().begin(), i);
return output_args[index].as<at::Tensor>();
}
} // namespace

KernelArgumentHolder HostIrExecutor::run(
KernelArgumentHolder& args,
KernelArgumentHolder output_args) {
Expand All @@ -120,13 +105,17 @@ KernelArgumentHolder HostIrExecutor::run(
auto expr_eval = executor_utils::bindInputs(args, host_ir_container_.get());

if (output_args.empty()) {
std::vector<GlobalBufferInfo> output_info = getBufferInfos(
std::vector<GlobalBufferInfo> output_infos = getBufferInfos(
expr_eval, PrimDataType::Int, host_ir_container_->outputs());
auto output_alias_to_input =
executor_utils::getOutputAliasToInputMap(host_ir_container_.get());
output_args = allocateOutputs(
host_ir_container_.get(),
output_info,
output_infos,
output_alias_to_input,
c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()),
expr_eval);
args,
true);
}

// TODO: If outputs are provided validate they're the correct size
Expand All @@ -136,8 +125,18 @@ KernelArgumentHolder HostIrExecutor::run(
c10d::Backend* backend =
communicator_->getBackendForTeam(communication->team(), std::nullopt);
auto in_tensor = expr_eval.evaluate(communication->in()).as<at::Tensor>();
at::Tensor out_tensor = findBufferForFusionOutput(
output_args, communication->out(), host_ir_container_.get());
auto out_idx = std::distance(
host_ir_container_->outputs().begin(),
std::find(
host_ir_container_->outputs().begin(),
host_ir_container_->outputs().end(),
communication->out()));

NVF_ERROR(
out_idx < (int64_t)host_ir_container_->outputs().size(),
"Output tensor not found in fusion outputs");
auto out_tensor = output_args[out_idx].as<at::Tensor>();

c10::intrusive_ptr<c10d::Work> work = postSingleCommunication(
communication,
communicator_->deviceId(),
Expand All @@ -148,6 +147,19 @@ KernelArgumentHolder HostIrExecutor::run(
work->wait();
}
}

// Evaluate outputs that are marked as Evaluate
for (auto out_idx : c10::irange(host_ir_container_->outputs().size())) {
auto out = host_ir_container_->outputs()[out_idx];
auto alias_info = host_ir_container_->getOutputAlias(out);
if (alias_info.type == AllocationType::Evaluate) {
NVF_ERROR(
!output_args[out_idx].hasValue(),
"Output tensor already has a value");
output_args[out_idx] = expr_eval.evaluate(out);
}
}

if (isProfilerEnabled()) {
FusionProfiler::segment(group_id_).setDevice(args.getDeviceIndex());
FusionProfiler::segment(group_id_).stopKernel();
Expand Down Expand Up @@ -572,13 +584,21 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) {
"Allocation must be on a TensorView but got ",
allocate->buffer());
TensorView* tv = allocate->buffer()->as<TensorView>();
if (expr_evaluator_.isKnown(tv)) {
return;
}
GlobalBufferInfo info =
getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0);
AliasInfo alias_info = {
.type = AllocationType::New, .aliased_io = nullptr, .hide_output = false};
c10::Device device =
communicator_ ? communicator_->device() : at::Device("cuda:0");
at::Tensor tensor = allocateTensor(info, alias_info, device, expr_evaluator_);
auto tensor = at::native::empty_strided_cuda(
info.shape_info.logical_sizes,
info.shape_info.logical_strides,
info.type,
c10::nullopt,
device,
c10::nullopt);

expr_evaluator_.bind(tv, tensor);
}

Expand Down
Loading