From a05351165811f75d0c15074190b5378fc8be369d Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 06:20:19 -0800 Subject: [PATCH 01/35] Store alloc and logical info in global buffer info. --- csrc/runtime/allocations.cpp | 57 ++++++++++++++++++++++++++-------- csrc/runtime/allocations.h | 15 +++++++-- csrc/runtime/executor.cpp | 60 +++++++++++++++++++++++++----------- csrc/serde/fusion_cache.fbs | 6 ++-- 4 files changed, 104 insertions(+), 34 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index b3bc66a1947..75f3e4fcabb 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -284,8 +284,8 @@ at::Tensor allocateTensor( switch (alias_info.type) { case AllocationType::New: { auto alloc_tensor = at::native::empty_strided_cuda( - out_info.sizes, - out_info.strides, + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides, out_info.type, c10::nullopt, device, @@ -380,7 +380,7 @@ GlobalBufferInfo getBufferInfo( FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfo"); GlobalBufferInfo info; info.tv = tv; - std::tie(info.sizes, info.strides) = inferShapeOfOutput(info.tv, expr_eval); + info.shape_info = inferTensorShapes(info.tv, expr_eval); auto dtype = (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); @@ -711,16 +711,9 @@ at::Tensor transformFromAllocationToLogical( return tensor.permute(dims); } -} // namespace - -std::pair, std::vector> inferShapeOfOutput( +std::pair, std::vector> inferAllocationShape( TensorView* tv, const ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); - // Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. 
- std::vector symbolic_sizes; std::vector expand_flags; @@ -745,8 +738,20 @@ std::pair, std::vector> inferShapeOfOutput( expand_flags.push_back(false); } } + return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +} + +} // namespace - auto size_stride = inferShape(tv, symbolic_sizes, expand_flags, expr_eval); +std::pair, std::vector> inferShapeOfOutput( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfOutput"); + // Fusion outputs do not come with Allocate and + // need to be allocated while taking expanded broadcasts into + // account. + + auto size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { return size_stride; } @@ -761,4 +766,32 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval) { + auto allocation_size_stride = inferAllocationShape(tv, expr_eval); + if (!tv->hasAllocation()) { + return TensorShapeInfo{ + allocation_size_stride.first, + allocation_size_stride.second, + allocation_size_stride.first, + allocation_size_stride.second}; + } + + auto options = + c10::TensorOptions().device(c10::Device(c10::DeviceType::Meta)); + auto logical_meta_tensor = at::empty_strided( + allocation_size_stride.first, allocation_size_stride.second, options); + // TODO(jiej): we should refactor it here, there's no need to use + // logical_meta_tensor at all, size + stride should be used directly in the + // `transformFromAllocationToLogical` + logical_meta_tensor = + transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); + return { + logical_meta_tensor.sizes().vec(), + logical_meta_tensor.strides().vec(), + allocation_size_stride.first, + allocation_size_stride.second}; +} + } // namespace nvfuser diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 01fb9a041c1..8b4ca523e9d 
100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -15,10 +15,16 @@ namespace nvfuser { +struct TensorShapeInfo { + std::vector logical_sizes; + std::vector logical_strides; + std::vector allocation_sizes; + std::vector allocation_strides; +}; + struct GlobalBufferInfo { TensorView* tv = nullptr; - std::vector sizes; - std::vector strides; + TensorShapeInfo shape_info; at::ScalarType type = at::ScalarType::Undefined; bool zero_init = false; bool resets_to_zero = false; @@ -61,6 +67,11 @@ std::pair, std::vector> inferShapeOfOutput( TensorView* tv, const ExpressionEvaluator& expr_eval); +// Infer the sizes and strides of an output tensor +TensorShapeInfo inferTensorShapes( + TensorView* tv, + const ExpressionEvaluator& expr_eval); + // Allocate an `at::Tensor` for `out_info` or compute it as an alias. at::Tensor allocateTensor( const GlobalBufferInfo& out_info, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 54c3b8bf61c..88911015cc5 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -480,9 +480,13 @@ std::vector KernelExecutor::getIntermediateBufferInfo( tv->getMaybeAllocationDomain().begin(), tv->getMaybeAllocationDomain().end(), [](IterDomain* id) { return id->hasExpandedExtent(); }); - std::tie(info.sizes, info.strides) = has_expanded_domains + auto [sizes, strides] = has_expanded_domains ? inferShapeOfOutput(tv, expr_eval) : inferShapeOfIntermediate(tv, alloc, expr_eval); + info.shape_info.allocation_sizes = sizes; + info.shape_info.allocation_strides = strides; + info.shape_info.logical_sizes = sizes; + info.shape_info.logical_strides = strides; auto dtype = (tv->dtype() == DataType::Index ? 
index_type : tv->dtype()); info.type = data_type_to_aten(dtype); @@ -664,10 +668,13 @@ void KernelExecutor::initializeExecutorEntry( // future uses of this ExecutorEntry may not be provided with // allocated outputs for (const auto& output : outputs) { - output_info.emplace_back(GlobalBufferInfo{ - .sizes = output.sizes().vec(), - .strides = output.strides().vec(), - .type = output.scalar_type()}); + GlobalBufferInfo info; + info.type = output.scalar_type(); + info.shape_info.allocation_sizes = output.sizes().vec(); + info.shape_info.allocation_strides = output.strides().vec(); + info.shape_info.logical_sizes = output.sizes().vec(); + info.shape_info.logical_strides = output.strides().vec(); + output_info.emplace_back(info); } } @@ -998,14 +1005,17 @@ std::vector KernelExecutor::run( const auto& buf_info = executor_entry->intermediates.at(i); bool has_expansion = false; std::vector unexpanded_sizes; - unexpanded_sizes.reserve(buf_info.sizes.size()); - NVF_ERROR(buf_info.sizes.size() == buf_info.strides.size()) - for (const auto j : c10::irange(buf_info.sizes.size())) { - if (buf_info.strides[j] == 0) { + unexpanded_sizes.reserve(buf_info.shape_info.allocation_sizes.size()); + NVF_ERROR( + buf_info.shape_info.allocation_sizes.size() == + buf_info.shape_info.allocation_strides.size()) + for (const auto j : + c10::irange(buf_info.shape_info.allocation_sizes.size())) { + if (buf_info.shape_info.allocation_strides[j] == 0) { has_expansion = true; unexpanded_sizes.push_back(1L); } else { - unexpanded_sizes.push_back(buf_info.sizes[j]); + unexpanded_sizes.push_back(buf_info.shape_info.allocation_sizes[j]); } } at::Tensor intermediate_buffer; @@ -1036,8 +1046,8 @@ std::vector KernelExecutor::run( } } if (has_expansion) { - intermediate_buffer = - at::native::expand(intermediate_buffer, buf_info.sizes); + intermediate_buffer = at::native::expand( + intermediate_buffer, buf_info.shape_info.allocation_sizes); } args.push(intermediate_buffer); 
intermediates.push_back(intermediate_buffer); @@ -1309,8 +1319,10 @@ flatbuffers::Offset KernelExecutor::serialize( return serde::CreateGlobalBufferInfoDirect( builder, tv_position, - &data.sizes, - &data.strides, + &data.shape_info.logical_sizes, + &data.shape_info.logical_strides, + &data.shape_info.allocation_sizes, + &data.shape_info.allocation_strides, nvfuser::toUnderlying(data.type), data.zero_init, data.resets_to_zero, @@ -1430,14 +1442,26 @@ GlobalBufferInfo KernelExecutor::deserialize( info.tv = dynamic_cast(out_val->buffer()); } - for (auto dim_size : *buffer->sizes()) { - info.sizes.emplace_back(dim_size); + TensorShapeInfo shape_info; + + for (auto dim_size : *buffer->logical_sizes()) { + shape_info.logical_sizes.emplace_back(dim_size); + } + + for (auto dim_stride : *buffer->logical_strides()) { + shape_info.logical_strides.emplace_back(dim_stride); } - for (auto dim_stride : *buffer->strides()) { - info.strides.emplace_back(dim_stride); + for (auto dim_size : *buffer->alloc_sizes()) { + shape_info.allocation_sizes.emplace_back(dim_size); } + for (auto dim_stride : *buffer->alloc_strides()) { + shape_info.allocation_strides.emplace_back(dim_stride); + } + + info.shape_info = shape_info; + info.type = serde::mapToAtenDtype(buffer->dtype()); info.zero_init = buffer->zero_init(); info.resets_to_zero = buffer->resets_to_zero(); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 562f35d9e59..30a75729147 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -215,8 +215,10 @@ table LaunchParams { // For intermediate tensors, we use its position in the KernelSummary global_allocations. 
table GlobalBufferInfo { tv : long = -1; - sizes : [long]; - strides : [long]; + logical_sizes : [long]; + logical_strides : [long]; + alloc_sizes : [long]; + alloc_strides : [long]; dtype : long; zero_init : bool; resets_to_zero : bool; From c6eb652a0a50e65f1e3186eb875ce034dba9b3a8 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 07:26:22 -0800 Subject: [PATCH 02/35] Draft removing expr_eval from KernelExecutor::run. --- csrc/runtime/allocations.cpp | 42 +++++++++++ csrc/runtime/allocations.h | 6 ++ csrc/runtime/executor.cpp | 105 ++++++++++++++++++++++----- csrc/runtime/executor.h | 7 ++ csrc/runtime/executor_kernel_arg.cpp | 32 ++++++++ csrc/runtime/executor_kernel_arg.h | 5 ++ csrc/tensor_metadata.cpp | 3 +- 7 files changed, 181 insertions(+), 19 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 75f3e4fcabb..53da6c08728 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -372,6 +372,48 @@ std::vector allocateOutputs( return out_tensors; } +std::vector allocateKernelOutputs( + const Fusion* fusion, + const std::vector& output_infos, + const c10::Device& device, + const KernelArgumentHolder& args) { + FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); + + NVF_ERROR( + std::any_of( + output_infos.output_aliased_to_output.begin(), + output_infos.output_aliased_to_output.end(), + [](int idx) { return idx != -1; }), + "Kernel's don't support output to output aliasing."); + + std::vector out_tensors; + out_tensors.reserve(output_infos.size()); + for (auto out_idx : c10::irange(output_infos.size())) { + auto out_info = output_infos.at(out_idx); + if (output_infos.output_aliased_to_input.at(out_info.tv->startIdx()) == + -1) { + auto alloc_tensor = at::native::empty_strided_cuda( + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides, + out_info.type, + c10::nullopt, + device, + c10::nullopt); + if (shouldFillAllocationWithNan()) { + 
fillTensorWithNan(alloc_tensor); + } + out_tensors.emplace_back(alloc_tensor); + } else { + auto input_arg = args[output_infos.output_aliased_to_input.at(out_idx)]; + NVF_ERROR( + input_arg.is(), + "Aliased input argument is not a tensor."); + out_tensors.emplace_back(input_arg.as()); + } + } + return out_tensors; +} + namespace { GlobalBufferInfo getBufferInfo( ExpressionEvaluator& expr_eval, diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 8b4ca523e9d..06333a91caa 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -87,6 +87,12 @@ std::vector allocateOutputs( const c10::Device& device, ExpressionEvaluator& ee); +std::vector allocateKernelOutputs( + const Fusion* fusion, + const std::vector& output_infos, + const c10::Device& device, + const KernelArgumentHolder& args); + //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is //! specified by the list of int pairs of input and output indices diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 88911015cc5..f8b2742b8ae 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -667,9 +667,16 @@ void KernelExecutor::initializeExecutorEntry( // Need to save the information necessary for allocations as // future uses of this ExecutorEntry may not be provided with // allocated outputs - for (const auto& output : outputs) { + for (auto output_idx : c10::irange(outputs.size())) { + const auto& output = outputs[output_idx]; GlobalBufferInfo info; info.type = output.scalar_type(); + info.tv = compiled_kernel_->kernel()->outputs()[output_idx]; + NVF_ERROR( + !info.tv->hasAllocation(), + "Accepting allocated outputs is not currently supported with allocation domain. 
", + "Allocation domain found for tv: ", + info.tv->toString()); info.shape_info.allocation_sizes = output.sizes().vec(); info.shape_info.allocation_strides = output.strides().vec(); info.shape_info.logical_sizes = output.sizes().vec(); @@ -678,11 +685,49 @@ void KernelExecutor::initializeExecutorEntry( } } + std::vector output_aliased_to_input(outputs.size(), -1); + std::vector output_aliased_to_output(outputs.size(), -1); + + for (auto output_idx : c10::irange(outputs.size())) { + auto out_info = output_info[output_idx]; + auto fusion = compiled_kernel_->kernel()->as(); + auto alias_info = fusion->getOutputAliasInfo(out_info.tv); + NVF_ERROR( + alias_info.type != AllocationType::Evaluate, + "Outputs should not be evaluate type for kernels."); + if (alias_info.type == AllocationType::New) { + continue; + } + auto aliased_to = alias_info.aliased_io->as(); + auto aliased_to_idx = + std::find( + fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - + fusion->inputs().begin(); + if (aliased_to_idx < fusion->inputs().size()) { + output_aliased_to_input[output_idx] = aliased_to_idx; + } else { + output_aliased_to_output[output_idx] = + std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - + fusion->outputs().begin(); + NVF_ERROR( + output_aliased_to_output[output_idx] < fusion->outputs().size(), + "Alias found but is not an output or input of the fusion. ", + "Aliased to tv: ", + aliased_to->toString(), + "\nFusion Inputs:\n ", + fusion->inputs(), + "\nFusion Outputs:\n ", + fusion->outputs()); + } + } + auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); // All information is gathered. 
Save it to ExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; + executor_entry.output_aliased_to = output_aliased_to; executor_entry.intermediates = intermediates; executor_entry.init = true; } @@ -782,6 +827,32 @@ void KernelExecutor::computeArgs( } } +// set the arguments that we'll pass to cuLaunchKernel +// TODO: Add to header +void KernelExecutor::computeArgs2( + ExecutorEntry& entry, + const std::vector& outputs, + const std::vector& intermediates) const { + FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + + entry.args.resize(outputs.size() + intermediates.size()); + entry.arg_ptrs.resize(outputs.size() + intermediates.size()); + + NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); + NVF_ERROR(entry.intermediates.size() == intermediates.size(), "Intermediates size mismatch"); + + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); + for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { + entry.args[out_idx] = getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); + entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); + } + + for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { + entry.args[out_idx + inter_idx] = getKernelArgument(intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); + entry.arg_ptrs[out_idx + inter_idx] = entry.args[out_idx + inter_idx].data(); + } +} + // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. 
void KernelExecutor::recomputeArgs( @@ -972,15 +1043,15 @@ std::vector KernelExecutor::run( at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // Bind fusion inputs - auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + // auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); // only allocate outputs when not given if (outputs.empty()) { - outputs = allocateOutputs( + outputs = allocateKernelOutputs( compiled_kernel_->kernel()->as(), executor_entry->outputs, compiled_kernel_->device(), - expr_eval); + args); } args.push(outputs); @@ -993,8 +1064,8 @@ std::vector KernelExecutor::run( // Skip trivially forwarded outputs because they are just placeholders continue; } - expr_eval.bind( - output, args[compiled_kernel_->kernel()->inputs().size() + i]); + // expr_eval.bind( + // output, args[compiled_kernel_->kernel()->inputs().size() + i]); } std::vector intermediates; @@ -1051,23 +1122,21 @@ std::vector KernelExecutor::run( } args.push(intermediate_buffer); intermediates.push_back(intermediate_buffer); - expr_eval.bind( - compiled_kernel_->kernel() - ->summary() - .global_allocations.at(i) - ->buffer(), - args - [compiled_kernel_->kernel()->inputs().size() + outputs.size() + - i]); + // expr_eval.bind( + // compiled_kernel_->kernel() + // ->summary() + // .global_allocations.at(i) + // ->buffer(), + // args + // [compiled_kernel_->kernel()->inputs().size() + outputs.size() + + // i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } } } - if (executor_entry->args.empty()) { - computeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); - } + computeArgs2(*executor_entry, outputs, intermediates); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); @@ -1093,7 +1162,7 @@ std::vector KernelExecutor::run( FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - 
recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); + // recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index bdce798369b..ece1c2987e3 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -122,6 +122,10 @@ class KernelExecutor : public ExecutorAbstract { bool init = false; LaunchParams launch_params; std::vector outputs; + // If an output is aliased to an input, this will hold the index of the + // input that it is aliased to. If not aliased, it will hold -1. + std::vector output_aliased_to_input; + std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; // The arguments to the kernel. These are configured in computeArgs and @@ -228,6 +232,9 @@ class KernelExecutor : public ExecutorAbstract { // to we have now. void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) const; + + void computeArgs2(ExecutorEntry& entry, const std::vector& outputs) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. 
// recomputeArgs will fail if the arity of the function changes, or the rank diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 8a0fc4fe7ce..9197cd63771 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -377,6 +377,38 @@ std::vector getKernelArgument( return polymorphicValueToBytes(pv, parameter->dtype(), index_type); } +std::vector getKernelArgument( + at::Tensor tensor, + const GlobalBufferInfo& output_info, + PrimDataType index_type) { + FUSER_PERF_SCOPE("getKernelArgument"); + + NVF_ERROR( + tensor.is_cuda() || tensor.is_meta(), + "GetMetaData expects a CUDA/meta tensor as input, but got: ", + tensor); + + std::shared_ptr struct_ = std::make_shared(); + TensorMetaData* metadata = (TensorMetaData*)struct_.get(); + metadata->dtype = + std::get(aten_to_data_type(tensor.scalar_type()).type); + metadata->data = tensor.data_ptr(); + + metadata->logical_size_data = output_info.shape_info.logical_sizes; + metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data); + metadata->logical_stride_data = output_info.shape_info.logical_strides; + metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); + metadata->alloc_size_data = output_info.shape_info.allocation_sizes; + metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data); + metadata->alloc_stride_data = output_info.shape_info.allocation_strides; + metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data); + + return polymorphicValueToBytes( + PolymorphicValue(std::move(struct_)), + output_info.tv->dtype(), + index_type); +} + int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index e4a8c6b4ea9..a583104f66c 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h 
@@ -222,6 +222,11 @@ std::vector getKernelArgument( Val* parameter, PrimDataType index_type); +std::vector getKernelArgument( + at::Tensor tensor, + const GlobalBufferInfo& output_info, + PrimDataType index_type); + int64_t computeBytes(const KernelArgumentHolder& args); int64_t computeBytes(const std::vector& outputs); diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index e82aeef08c0..fd7d4a60df2 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -356,7 +356,8 @@ std::vector GetMetaData::evaluate( } else { metadata->logical_size = input.sizes(); } - metadata->logical_stride = input.strides(); + metadata->logical_stride_data = input.strides(); + metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); auto [allocation_sizes, allocation_strides] = inferAndValidateAllocationSizesAndStrides(input, tv, ee); From 543f9107be6c4ec15250c8d9dd1d83fa89a2fcea Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 07:27:42 -0800 Subject: [PATCH 03/35] Lint. 
--- csrc/runtime/executor.cpp | 16 +++++++++++----- csrc/runtime/executor.h | 6 ++++-- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index f8b2742b8ae..dbd231f6d69 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -839,17 +839,22 @@ void KernelExecutor::computeArgs2( entry.arg_ptrs.resize(outputs.size() + intermediates.size()); NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); - NVF_ERROR(entry.intermediates.size() == intermediates.size(), "Intermediates size mismatch"); + NVF_ERROR( + entry.intermediates.size() == intermediates.size(), + "Intermediates size mismatch"); const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { - entry.args[out_idx] = getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); + entry.args[out_idx] = + getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); } for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[out_idx + inter_idx] = getKernelArgument(intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[out_idx + inter_idx] = entry.args[out_idx + inter_idx].data(); + entry.args[out_idx + inter_idx] = getKernelArgument( + intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); + entry.arg_ptrs[out_idx + inter_idx] = + entry.args[out_idx + inter_idx].data(); } } @@ -1043,7 +1048,8 @@ std::vector KernelExecutor::run( at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; // Bind fusion inputs - // auto expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + // auto expr_eval = executor_utils::bindInputs(args, + // compiled_kernel_->kernel()); // only allocate outputs when not given if (outputs.empty()) { diff --git a/csrc/runtime/executor.h 
b/csrc/runtime/executor.h index ece1c2987e3..e0b016423f2 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -233,8 +233,10 @@ class KernelExecutor : public ExecutorAbstract { void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) const; - void computeArgs2(ExecutorEntry& entry, const std::vector& outputs) const; - + void computeArgs2( + ExecutorEntry& entry, + const std::vector& outputs) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. // recomputeArgs will fail if the arity of the function changes, or the rank From f3d4320c0bf24cd07ece855cb15792a3fa85faeb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 08:05:22 -0800 Subject: [PATCH 04/35] Build fixes. --- csrc/runtime/allocations.cpp | 18 +++--- csrc/runtime/allocations.h | 4 +- csrc/runtime/executor.cpp | 61 +++++++++--------- csrc/runtime/executor.h | 95 +++++++++++++++------------- csrc/runtime/executor_kernel_arg.cpp | 1 + csrc/runtime/executor_kernel_arg.h | 2 + csrc/serde/fusion_cache.fbs | 8 +-- csrc/tensor_metadata.cpp | 3 +- 8 files changed, 103 insertions(+), 89 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 53da6c08728..89aed454a4f 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -374,24 +375,23 @@ std::vector allocateOutputs( std::vector allocateKernelOutputs( const Fusion* fusion, - const std::vector& output_infos, + const KernelExecutorEntry& entry, const c10::Device& device, const KernelArgumentHolder& args) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); NVF_ERROR( std::any_of( - output_infos.output_aliased_to_output.begin(), - output_infos.output_aliased_to_output.end(), + entry.output_aliased_to_output.begin(), + entry.output_aliased_to_output.end(), 
[](int idx) { return idx != -1; }), "Kernel's don't support output to output aliasing."); std::vector out_tensors; - out_tensors.reserve(output_infos.size()); - for (auto out_idx : c10::irange(output_infos.size())) { - auto out_info = output_infos.at(out_idx); - if (output_infos.output_aliased_to_input.at(out_info.tv->startIdx()) == - -1) { + out_tensors.reserve(entry.outputs.size()); + for (auto out_idx : c10::irange(entry.outputs.size())) { + auto out_info = entry.outputs.at(out_idx); + if (entry.output_aliased_to_input.at(out_idx) == -1) { auto alloc_tensor = at::native::empty_strided_cuda( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides, @@ -404,7 +404,7 @@ std::vector allocateKernelOutputs( } out_tensors.emplace_back(alloc_tensor); } else { - auto input_arg = args[output_infos.output_aliased_to_input.at(out_idx)]; + auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 06333a91caa..999692f936e 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -15,6 +15,8 @@ namespace nvfuser { +struct KernelExecutorEntry; + struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; @@ -89,7 +91,7 @@ std::vector allocateOutputs( std::vector allocateKernelOutputs( const Fusion* fusion, - const std::vector& output_infos, + const KernelExecutorEntry& entry, const c10::Device& device, const KernelArgumentHolder& args); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index dbd231f6d69..61ad8992bc2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -616,7 +616,7 @@ void dumpKernelArgs( } // namespace void KernelExecutor::initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& 
compile_params, @@ -665,13 +665,15 @@ void KernelExecutor::initializeExecutorEntry( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { // Need to save the information necessary for allocations as - // future uses of this ExecutorEntry may not be provided with + // future uses of this KernelExecutorEntry may not be provided with // allocated outputs for (auto output_idx : c10::irange(outputs.size())) { const auto& output = outputs[output_idx]; GlobalBufferInfo info; info.type = output.scalar_type(); - info.tv = compiled_kernel_->kernel()->outputs()[output_idx]; + auto out_val = compiled_kernel_->kernel()->outputs()[output_idx]; + NVF_ERROR(out_val->isA(), "Output is not a TensorView"); + info.tv = out_val->as(); NVF_ERROR( !info.tv->hasAllocation(), "Accepting allocated outputs is not currently supported with allocation domain. ", @@ -691,7 +693,7 @@ void KernelExecutor::initializeExecutorEntry( for (auto output_idx : c10::irange(outputs.size())) { auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); - auto alias_info = fusion->getOutputAliasInfo(out_info.tv); + auto alias_info = fusion->getOutputAlias(out_info.tv); NVF_ERROR( alias_info.type != AllocationType::Evaluate, "Outputs should not be evaluate type for kernels."); @@ -703,7 +705,7 @@ void KernelExecutor::initializeExecutorEntry( std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - fusion->inputs().begin(); - if (aliased_to_idx < fusion->inputs().size()) { + if (aliased_to_idx < (int64_t)fusion->inputs().size()) { output_aliased_to_input[output_idx] = aliased_to_idx; } else { output_aliased_to_output[output_idx] = @@ -711,7 +713,7 @@ void KernelExecutor::initializeExecutorEntry( fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - fusion->outputs().begin(); NVF_ERROR( - output_aliased_to_output[output_idx] < fusion->outputs().size(), + output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found 
but is not an output or input of the fusion. ", "Aliased to tv: ", aliased_to->toString(), @@ -724,10 +726,11 @@ void KernelExecutor::initializeExecutorEntry( auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); - // All information is gathered. Save it to ExecutorEntry + // All information is gathered. Save it to KernelExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; - executor_entry.output_aliased_to = output_aliased_to; + executor_entry.output_aliased_to_input = output_aliased_to_input; + executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; executor_entry.init = true; } @@ -750,7 +753,7 @@ void KernelExecutor::initializeExecutorEntry( /// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for /// computing how large the arrays to copy are. static void fillTensorArgMetadata( - KernelExecutor::ExecutorEntry& entry, + KernelExecutorEntry& entry, const PolymorphicValue& tensor_metadata, size_t idx, size_t idx_type_size) { @@ -812,7 +815,7 @@ static void fillTensorArgMetadata( // It does not need to happen when only shapes change---use recomputeArgs for // that. 
void KernelExecutor::computeArgs( - ExecutorEntry& entry, + KernelExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); @@ -830,7 +833,7 @@ void KernelExecutor::computeArgs( // set the arguments that we'll pass to cuLaunchKernel // TODO: Add to header void KernelExecutor::computeArgs2( - ExecutorEntry& entry, + KernelExecutorEntry& entry, const std::vector& outputs, const std::vector& intermediates) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); @@ -851,17 +854,17 @@ void KernelExecutor::computeArgs2( } for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[out_idx + inter_idx] = getKernelArgument( + entry.args[outputs.size() + inter_idx] = getKernelArgument( intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[out_idx + inter_idx] = - entry.args[out_idx + inter_idx].data(); + entry.arg_ptrs[outputs.size() + inter_idx] = + entry.args[outputs.size() + inter_idx].data(); } } // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. void KernelExecutor::recomputeArgs( - ExecutorEntry& entry, + KernelExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); @@ -1015,9 +1018,9 @@ std::vector KernelExecutor::run( NVF_ERROR(compiled_kernel_->lowered()); // Placeholder for the case where parameter cache is not used - ExecutorEntry temporary_executor_entry; + KernelExecutorEntry temporary_executor_entry; - ExecutorEntry* executor_entry = args.getCacheId().has_value() && + KernelExecutorEntry* executor_entry = args.getCacheId().has_value() && !compiled_kernel_->disablePaarameterCache() ? 
&executor_entry_lookup_[*args.getCacheId()] : &temporary_executor_entry; @@ -1054,8 +1057,8 @@ std::vector KernelExecutor::run( // only allocate outputs when not given if (outputs.empty()) { outputs = allocateKernelOutputs( - compiled_kernel_->kernel()->as(), - executor_entry->outputs, + compiled_kernel_->kernel(), + *executor_entry, compiled_kernel_->device(), args); } @@ -1247,7 +1250,7 @@ std::vector KernelExecutor::run( flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { // See table definition for KernelExecutor in serde/fusion_cache.fbs - using fb_executor_entry = flatbuffers::Offset; + using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value // vectors. The key value is the cache_id value in the KernelArgumentHolder. @@ -1328,10 +1331,10 @@ flatbuffers::Offset KernelExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset KernelExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs + const KernelExecutorEntry& data) const { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs // Serialize GlobalBufferInfo for outputs. 
// We map the output TensorView pointer to its corresponding position in @@ -1377,7 +1380,7 @@ flatbuffers::Offset KernelExecutor::serialize( serialize(builder, buffer, tv_position, false /* is_fusion_output */)); } - return serde::CreateExecutorEntryDirect( + return serde::CreateKernelExecutorEntryDirect( builder, data.init, data.launch_params.serialize(builder), @@ -1469,13 +1472,13 @@ void KernelExecutor::deserialize( } } -KernelExecutor::ExecutorEntry KernelExecutor::deserialize( - const serde::ExecutorEntry* buffer) { - // See table definition for ExecutorEntry in serde/fusion_cache.fbs +KernelExecutorEntry KernelExecutor::deserialize( + const serde::KernelExecutorEntry* buffer) { + // See table definition for KernelExecutorEntry in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::ExecutorEntry is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutorEntry is nullptr."); - ExecutorEntry entry; + KernelExecutorEntry entry; entry.init = buffer->init(); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index e0b016423f2..1fecd1c32c8 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -58,6 +58,35 @@ class ExprEvalExecutor : public ExecutorAbstract { std::unique_ptr fusion_; }; +// struct used to hold necessary information to launch compiled kernel on a +// given input set. +// +// TODO: strides would also be important when we handle permutations in +// codegen. +// +struct KernelExecutorEntry { + bool init = false; + LaunchParams launch_params; + std::vector outputs; + // If an output is aliased to an input, this will hold the index of the + // input that it is aliased to. If not aliased, it will hold -1. + std::vector output_aliased_to_input; + std::vector output_aliased_to_output; + // Temporary work buffers and intemediate global-memory tensors + std::vector intermediates; + // The arguments to the kernel. These are configured in computeArgs and + // recomputeArgs. 
+ // For the common case of a tensor argument, these correspond to the + // `struct Tensor` data in runtime/tensor.cu. That means each tensor + // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) + + // len(shape)*sizeof(int) byte array (here "int" is used in place of the + // index type, which varies in practice). + std::vector> args; + // This is just the data() pointers to the above `args`; cuLaunchKernel + // requires an array of this form. + std::vector arg_ptrs; +}; + class KernelExecutor : public ExecutorAbstract { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. @@ -112,35 +141,6 @@ class KernelExecutor : public ExecutorAbstract { executor_entry_lookup_.erase(cache_id); } - // struct used to hold necessary information to launch compiled kernel on a - // given input set. - // - // TODO: strides would also be important when we handle permutations in - // codegen. - // - struct ExecutorEntry { - bool init = false; - LaunchParams launch_params; - std::vector outputs; - // If an output is aliased to an input, this will hold the index of the - // input that it is aliased to. If not aliased, it will hold -1. - std::vector output_aliased_to_input; - std::vector output_aliased_to_output; - // Temporary work buffers and intemediate global-memory tensors - std::vector intermediates; - // The arguments to the kernel. These are configured in computeArgs and - // recomputeArgs. - // For the common case of a tensor argument, these correspond to the - // `struct Tensor` data in runtime/tensor.cu. That means each tensor - // element in `args` would be a sizeof(void*) + len(shape)*sizeof(int) + - // len(shape)*sizeof(int) byte array (here "int" is used in place of the - // index type, which varies in practice). - std::vector> args; - // This is just the data() pointers to the above `args`; cuLaunchKernel - // requires an array of this form. 
- std::vector arg_ptrs; - }; - using ExecutorCompileTimeInfoCache = executor_utils::caching::ExecutorCompileTimeInfoCache; @@ -217,9 +217,9 @@ class KernelExecutor : public ExecutorAbstract { return &compile_time_info_cache_; } - //! TODO: Consider changing this to a constructor of ExecutorEntry + //! TODO: Consider changing this to a constructor of KernelExecutorEntry void initializeExecutorEntry( - ExecutorEntry& executor_entry, + KernelExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, @@ -230,35 +230,40 @@ class KernelExecutor : public ExecutorAbstract { // Creates the initial set of arguments to a kernel, based on the arguments // to we have now. - void computeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) - const; + void computeArgs( + KernelExecutorEntry&, + ExpressionEvaluator&, + const kir::Kernel*) const; void computeArgs2( - ExecutorEntry& entry, - const std::vector& outputs) const; + KernelExecutorEntry& entry, + const std::vector& outputs, + const std::vector& intermediates) const; // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. // recomputeArgs will fail if the arity of the function changes, or the rank // of any tensor changes (as these are compiled-in to the generated kernel // and therefore would require us to do a larger recompilation). - void recomputeArgs(ExecutorEntry&, ExpressionEvaluator&, const kir::Kernel*) - const; + void recomputeArgs( + KernelExecutorEntry&, + ExpressionEvaluator&, + const kir::Kernel*) const; //! Serialize CompiledKernel using flatbuffers flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CudaExecutable* kernel) const; - // ExecutorEntry is an internal POD struct for the KernelExecutor class. 
- // We define ExecutorEntry's serialize and deserialize as private methods in - // KernelExecutor. - flatbuffers::Offset serialize( + // KernelExecutorEntry is an internal POD struct for the KernelExecutor class. + // We define KernelExecutorEntry's serialize and deserialize as private + // methods in KernelExecutor. + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, - const ExecutorEntry& data) const; + const KernelExecutorEntry& data) const; - //! Deserialize ExecutorEntry using flatbuffers - ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); + //! Deserialize KernelExecutorEntry using flatbuffers + KernelExecutorEntry deserialize(const serde::KernelExecutorEntry* buffer); // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods @@ -307,7 +312,7 @@ class KernelExecutor : public ExecutorAbstract { // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. - std::unordered_map executor_entry_lookup_; + std::unordered_map executor_entry_lookup_; // Compile time information caching. This is used for shape inference // support. The cache stores graph information that are available diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 9197cd63771..74819f0fea9 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -8,6 +8,7 @@ #include // Extract size and strides +#include #include #include diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index a583104f66c..f2026ebe2e6 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -22,6 +22,8 @@ namespace nvfuser { +class GlobalBufferInfo; + //! KernelArgumentHolder copies meta information from kernel inputs, including //! 
tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used //! for both compilation as well as kernel execution. It takes ownership of diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 30a75729147..0d7cdd97d99 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -189,7 +189,7 @@ table KernelArgumentHolder { // // ===================================================================================== -// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in KernelExecutor +// Tables for LaunchParams, GlobalBufferInfo, KernelExecutorEntry, and TensorShape used in KernelExecutor // Data representing a tensor shape used in LaunchParam table TensorShape { @@ -226,8 +226,8 @@ table GlobalBufferInfo { is_fusion_output : bool; } -// This table describes the cached ExecutorEntry for a kernel. -table ExecutorEntry { +// This table describes the cached KernelExecutorEntry for a kernel. +table KernelExecutorEntry { init : bool; launch_params : LaunchParams; outputs : [GlobalBufferInfo]; @@ -370,7 +370,7 @@ table KernelExecutor { group_id: long; kernel_code: string; executor_entry_lookup_keys: [ulong]; - executor_entry_lookup_values: [ExecutorEntry]; + executor_entry_lookup_values: [KernelExecutorEntry]; // Is this kernel being compiled with int32 or int64 indexing? 
index_type : long; compiled_kernel: CudaKernel; diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index fd7d4a60df2..e5dd17f4aca 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -356,7 +356,8 @@ std::vector GetMetaData::evaluate( } else { metadata->logical_size = input.sizes(); } - metadata->logical_stride_data = input.strides(); + metadata->logical_stride_data = + std::vector(input.strides().begin(), input.strides().end()); metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); auto [allocation_sizes, allocation_strides] = From 66777eb2d4e28c9a0bc0ce3403cdc3b6b6ace5e8 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 09:11:44 -0800 Subject: [PATCH 05/35] Getting closer, just an IMA now. --- csrc/runtime/allocations.cpp | 10 ++++------ csrc/runtime/executor.cpp | 13 ++++++++++--- csrc/runtime/executor_kernel_arg.cpp | 9 +++++---- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 89aed454a4f..3f242bfb63d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -380,12 +380,7 @@ std::vector allocateKernelOutputs( const KernelArgumentHolder& args) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - NVF_ERROR( - std::any_of( - entry.output_aliased_to_output.begin(), - entry.output_aliased_to_output.end(), - [](int idx) { return idx != -1; }), - "Kernel's don't support output to output aliasing."); + // TODO: Figure out if output to output aliasing is needed std::vector out_tensors; out_tensors.reserve(entry.outputs.size()); @@ -404,6 +399,9 @@ std::vector allocateKernelOutputs( } out_tensors.emplace_back(alloc_tensor); } else { + NVF_ERROR( + entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), + "Tried to grab an out of range input argument."); auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR( input_arg.is(), diff --git 
a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 61ad8992bc2..541f0fa30ab 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -661,6 +661,7 @@ void KernelExecutor::initializeExecutorEntry( std::vector output_info; if (outputs.empty()) { + std::cout << compiled_kernel_->kernel()->outputs() << std::endl; output_info = getBufferInfos( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { @@ -687,10 +688,10 @@ void KernelExecutor::initializeExecutorEntry( } } - std::vector output_aliased_to_input(outputs.size(), -1); - std::vector output_aliased_to_output(outputs.size(), -1); + std::vector output_aliased_to_input(output_info.size(), -1); + std::vector output_aliased_to_output(output_info.size(), -1); - for (auto output_idx : c10::irange(outputs.size())) { + for (auto output_idx : c10::irange(output_info.size())) { auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); @@ -721,6 +722,12 @@ void KernelExecutor::initializeExecutorEntry( fusion->inputs(), "\nFusion Outputs:\n ", fusion->outputs()); + TORCH_WARN( + "Kernel found with output to output aliasing, this is unsupported in a kernel and will beignored.\n", + "Output: ", + out_info.tv->toString(), + "\nAliased to: ", + aliased_to->toString()); } } diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 74819f0fea9..5d50eab4dbe 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -295,11 +295,11 @@ std::vector polymorphicValueToBytes( } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(StructHandle)"); std::vector buffer; - const auto& dtype_ = std::get(dtype.type); - auto& data = argument->*&TensorMetaData::data; - auto& logical_size = argument->*&TensorMetaData::logical_size; - auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; if (argument.as().is()) { + auto& 
data = argument->*&TensorMetaData::data; + auto& logical_size = argument->*&TensorMetaData::logical_size; + auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; + // special handle for TensorMetaData so that CPU overhead is minimal. if (index_type == PrimDataType::Int) { buffer.reserve( @@ -340,6 +340,7 @@ std::vector polymorphicValueToBytes( } return buffer; } else { + const auto& dtype_ = std::get(dtype.type); for (const auto& field : dtype_.fields) { if (!field.used_in_kernel) { continue; From 0a30236da9c021b212c47b875b76cc1816b536dc Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 17:14:51 -0800 Subject: [PATCH 06/35] Fix basic tests. --- csrc/runtime/executor.cpp | 43 +++++----- csrc/runtime/executor.h | 3 +- csrc/runtime/executor_kernel_arg.cpp | 120 +++++++++++++++------------ 3 files changed, 92 insertions(+), 74 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 541f0fa30ab..ef99a21319f 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -661,7 +661,6 @@ void KernelExecutor::initializeExecutorEntry( std::vector output_info; if (outputs.empty()) { - std::cout << compiled_kernel_->kernel()->outputs() << std::endl; output_info = getBufferInfos( expr_eval, index_type, compiled_kernel_->kernel()->outputs()); } else { @@ -841,30 +840,32 @@ void KernelExecutor::computeArgs( // TODO: Add to header void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, - const std::vector& outputs, - const std::vector& intermediates) const { + const KernelArgumentHolder& args) const { FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + if (entry.args.size() != args.size()) { + entry.args.resize(args.size()); + entry.arg_ptrs.resize(args.size()); + } - entry.args.resize(outputs.size() + intermediates.size()); - entry.arg_ptrs.resize(outputs.size() + intermediates.size()); - - NVF_ERROR(entry.outputs.size() == outputs.size(), "Outputs size mismatch"); NVF_ERROR( - 
entry.intermediates.size() == intermediates.size(), - "Intermediates size mismatch"); + args.size() == + compiled_kernel_->kernel()->inputs().size() + + entry.outputs.size() + entry.intermediates.size() && + args.size() == compiled_kernel_->kernel()->parameters().size(), + "Argument size mismatch, expected: ", + compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + + entry.intermediates.size(), + " got: ", + args.size()); const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); - for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { - entry.args[out_idx] = - getKernelArgument(outputs[out_idx], entry.outputs[out_idx], idx_type); - entry.arg_ptrs[out_idx] = entry.args[out_idx].data(); - } - - for (size_t inter_idx = 0; inter_idx < intermediates.size(); ++inter_idx) { - entry.args[outputs.size() + inter_idx] = getKernelArgument( - intermediates[inter_idx], entry.intermediates[inter_idx], idx_type); - entry.arg_ptrs[outputs.size() + inter_idx] = - entry.args[outputs.size() + inter_idx].data(); + for (size_t i = 0; i < args.size(); ++i) { + auto bytes = polymorphicValueToBytes( + args[i], + compiled_kernel_->kernel()->parameters()[i]->dtype(), + idx_type); + entry.args[i] = bytes; + entry.arg_ptrs[i] = entry.args[i].data(); } } @@ -1152,7 +1153,7 @@ std::vector KernelExecutor::run( } } - computeArgs2(*executor_entry, outputs, intermediates); + computeArgs2(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 1fecd1c32c8..faa6d67fc53 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -237,8 +237,7 @@ class KernelExecutor : public ExecutorAbstract { void computeArgs2( KernelExecutorEntry& entry, - const std::vector& outputs, - const std::vector& intermediates) const; + const KernelArgumentHolder& args) const; // Updates an existing set of arguments based on the current arguments. 
It is // is an error to call this before `computeArgs` has been invoked. diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 5d50eab4dbe..ee4ebf8091f 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -180,25 +180,75 @@ std::vector polymorphicValueToBytes( if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(at::Tensor)"); const auto& tensor = argument.as(); - NVF_ERROR( - tensor.is_cpu() && tensor.numel() == 1, - "Only CPU scalar tensors are supported here. ", - "For GPU tensors, please use their metadata."); - auto scalar_type = tensor.scalar_type(); - NVF_ERROR( - dtype == aten_to_data_type(scalar_type), - "Expected ", - dtype, - " but got ", - aten_to_data_type(scalar_type), - "."); - std::vector buffer; - buffer.reserve(tensor.element_size()); - buffer.insert( - buffer.end(), - (std::byte*)tensor.data_ptr(), - (std::byte*)tensor.data_ptr() + tensor.element_size()); - return buffer; + if (tensor.is_cpu()) { + NVF_ERROR( + tensor.numel() == 1, + "Only CPU scalar tensors are supported here. ", + "For GPU tensors, please use their metadata."); + auto scalar_type = tensor.scalar_type(); + NVF_ERROR( + dtype == aten_to_data_type(scalar_type), + "Expected ", + dtype, + " but got ", + aten_to_data_type(scalar_type), + "."); + std::vector buffer; + buffer.reserve(tensor.element_size()); + buffer.insert( + buffer.end(), + (std::byte*)tensor.data_ptr(), + (std::byte*)tensor.data_ptr() + tensor.element_size()); + return buffer; + } else { + NVF_ERROR( + tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); + + std::vector buffer; + auto data = tensor.data_ptr(); + auto logical_size = tensor.sizes(); + auto alloc_stride = tensor.strides(); + + // special handle for TensorMetaData so that CPU overhead is minimal. 
+ if (index_type == PrimDataType::Int) { + buffer.reserve( + sizeof(void*) + sizeof(int64_t) * logical_size.size() + + sizeof(int64_t) * alloc_stride.size()); + buffer.insert( + buffer.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + buffer.insert( + buffer.end(), + (std::byte*)logical_size.data(), + (std::byte*)logical_size.data() + + sizeof(int64_t) * logical_size.size()); + buffer.insert( + buffer.end(), + (std::byte*)alloc_stride.data(), + (std::byte*)alloc_stride.data() + + sizeof(int64_t) * alloc_stride.size()); + } else { + buffer.reserve( + sizeof(void*) + sizeof(int32_t) * logical_size.size() + + sizeof(int32_t) * alloc_stride.size()); + buffer.insert( + buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + std::vector logical_size32( + logical_size.begin(), logical_size.end()); + buffer.insert( + buffer.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_stride.begin(), alloc_stride.end()); + buffer.insert( + buffer.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + return buffer; + } } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(Pointer)"); NVF_ERROR( @@ -379,38 +429,6 @@ std::vector getKernelArgument( return polymorphicValueToBytes(pv, parameter->dtype(), index_type); } -std::vector getKernelArgument( - at::Tensor tensor, - const GlobalBufferInfo& output_info, - PrimDataType index_type) { - FUSER_PERF_SCOPE("getKernelArgument"); - - NVF_ERROR( - tensor.is_cuda() || tensor.is_meta(), - "GetMetaData expects a CUDA/meta tensor as input, but got: ", - tensor); - - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = - std::get(aten_to_data_type(tensor.scalar_type()).type); - metadata->data = tensor.data_ptr(); - - 
metadata->logical_size_data = output_info.shape_info.logical_sizes; - metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data); - metadata->logical_stride_data = output_info.shape_info.logical_strides; - metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data); - metadata->alloc_size_data = output_info.shape_info.allocation_sizes; - metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data); - metadata->alloc_stride_data = output_info.shape_info.allocation_strides; - metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data); - - return polymorphicValueToBytes( - PolymorphicValue(std::move(struct_)), - output_info.tv->dtype(), - index_type); -} - int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers From 93f7ebd79c136d0589e34de2183eda456d4a9593 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 24 Feb 2025 18:00:33 -0800 Subject: [PATCH 07/35] Still debugging. 
--- csrc/runtime/executor.cpp | 146 +++++++++++++++++++++++++-- csrc/runtime/executor.h | 1 + csrc/runtime/executor_kernel_arg.cpp | 88 ++++------------ 3 files changed, 156 insertions(+), 79 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index ef99a21319f..0d9a1815001 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -658,6 +658,46 @@ void KernelExecutor::initializeExecutorEntry( "Expected blockDim.x >= 32 but found ", launch_params.bdimx()); + std::vector input_info; + NVF_ERROR( + compiled_kernel_->kernel()->inputs().size() == args.size(), + "Input size mismatch, expected: ", + compiled_kernel_->kernel()->inputs().size(), + " got: ", + args.size()); + for (auto inp_idx : + c10::irange(compiled_kernel_->kernel()->inputs().size())) { + auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; + if (input->isA()) { + if (input->as()->hasAllocation()) { + input_info.emplace_back(getBufferInfos( + expr_eval, index_type, {input->as()})[0]); + } else { + TensorShapeInfo shape_info; + shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); + shape_info.logical_strides = + args[inp_idx].as().strides().vec(); + shape_info.allocation_sizes = + args[inp_idx].as().sizes().vec(); + shape_info.allocation_strides = + args[inp_idx].as().strides().vec(); + input_info.emplace_back(GlobalBufferInfo( + input->as(), + shape_info, + data_type_to_aten(input->dtype()), + false, + false, + false)); + } + } else { + input_info.emplace_back(GlobalBufferInfo()); + } + std::cout << "input_info.back().shape_info.logical_sizes: " + << input_info.back().shape_info.logical_sizes << std::endl; + std::cout << "input_info.back().shape_info.allocation_strides: " + << input_info.back().shape_info.allocation_strides << std::endl; + } + std::vector output_info; if (outputs.empty()) { @@ -738,6 +778,7 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.output_aliased_to_input = output_aliased_to_input; 
executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; + executor_entry.inputs = input_info; executor_entry.init = true; } @@ -841,6 +882,8 @@ void KernelExecutor::computeArgs( void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { + std::cout << "Args: " << args.toString() << std::endl; + FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); @@ -849,23 +892,106 @@ void KernelExecutor::computeArgs2( NVF_ERROR( args.size() == - compiled_kernel_->kernel()->inputs().size() + - entry.outputs.size() + entry.intermediates.size() && - args.size() == compiled_kernel_->kernel()->parameters().size(), + compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + + entry.intermediates.size(), "Argument size mismatch, expected: ", compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + entry.intermediates.size(), " got: ", args.size()); + NVF_ERROR( + args.size() == compiled_kernel_->kernel()->parameters().size(), + "Argument size mismatch, expected: ", + compiled_kernel_->kernel()->parameters().size(), + " got: ", + args.size()); + + auto buffer_info = [&](size_t idx) -> GlobalBufferInfo& { + if (idx < entry.inputs.size()) { + return entry.inputs[idx]; + } else if (idx < entry.inputs.size() + entry.outputs.size()) { + return entry.outputs[idx - entry.inputs.size()]; + } else if ( + idx < entry.inputs.size() + entry.outputs.size() + + entry.intermediates.size()) { + return entry + .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; + } else { + NVF_CHECK( + 0, + "Invalid buffer index: ", + idx, + " input size: ", + entry.inputs.size(), + " output size: ", + entry.outputs.size(), + " intermediate size: ", + entry.intermediates.size()); + } + }; + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); - for (size_t i = 0; i < args.size(); ++i) { - auto 
bytes = polymorphicValueToBytes( - args[i], - compiled_kernel_->kernel()->parameters()[i]->dtype(), - idx_type); - entry.args[i] = bytes; - entry.arg_ptrs[i] = entry.args[i].data(); + for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { + std::vector bytes; + if (args[arg_idx].is()) { + auto tensor = args[arg_idx].as(); + NVF_ERROR( + tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); + + auto data = tensor.data_ptr(); + const auto& logical_size = buffer_info(arg_idx).shape_info.logical_sizes; + const auto& alloc_stride = + buffer_info(arg_idx).shape_info.allocation_strides; + + // special handle for TensorMetaData so that CPU overhead is minimal. + if (idx_type == PrimDataType::Int) { + bytes.reserve( + sizeof(void*) + sizeof(int64_t) * logical_size.size() + + sizeof(int64_t) * alloc_stride.size()); + bytes.insert( + bytes.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + bytes.insert( + bytes.end(), + (std::byte*)logical_size.data(), + (std::byte*)logical_size.data() + + sizeof(int64_t) * logical_size.size()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride.data(), + (std::byte*)alloc_stride.data() + + sizeof(int64_t) * alloc_stride.size()); + } else { + bytes.reserve( + sizeof(void*) + sizeof(int32_t) * logical_size.size() + + sizeof(int32_t) * alloc_stride.size()); + bytes.insert( + bytes.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + std::vector logical_size32( + logical_size.begin(), logical_size.end()); + bytes.insert( + bytes.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_stride.begin(), alloc_stride.end()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + entry.args[arg_idx] = bytes; + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); + } else { + auto bytes = 
polymorphicValueToBytes( + args[arg_idx], + compiled_kernel_->kernel()->parameters()[arg_idx]->dtype(), + idx_type); + entry.args[arg_idx] = bytes; + entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); + } } } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index faa6d67fc53..70f372312d6 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -74,6 +74,7 @@ struct KernelExecutorEntry { std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; + std::vector inputs; // The arguments to the kernel. These are configured in computeArgs and // recomputeArgs. // For the common case of a tensor argument, these correspond to the diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index ee4ebf8091f..0392ae65b9a 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -180,75 +180,25 @@ std::vector polymorphicValueToBytes( if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(at::Tensor)"); const auto& tensor = argument.as(); - if (tensor.is_cpu()) { - NVF_ERROR( - tensor.numel() == 1, - "Only CPU scalar tensors are supported here. ", - "For GPU tensors, please use their metadata."); - auto scalar_type = tensor.scalar_type(); - NVF_ERROR( - dtype == aten_to_data_type(scalar_type), - "Expected ", - dtype, - " but got ", - aten_to_data_type(scalar_type), - "."); - std::vector buffer; - buffer.reserve(tensor.element_size()); - buffer.insert( - buffer.end(), - (std::byte*)tensor.data_ptr(), - (std::byte*)tensor.data_ptr() + tensor.element_size()); - return buffer; - } else { - NVF_ERROR( - tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); - - std::vector buffer; - auto data = tensor.data_ptr(); - auto logical_size = tensor.sizes(); - auto alloc_stride = tensor.strides(); - - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (index_type == PrimDataType::Int) { - buffer.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); - buffer.insert( - buffer.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - buffer.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - buffer.insert( - buffer.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - return buffer; - } + NVF_ERROR( + tensor.is_cpu() && tensor.numel() == 1, + "Only CPU scalar tensors are supported here. 
", + "For GPU tensors, please use their metadata."); + auto scalar_type = tensor.scalar_type(); + NVF_ERROR( + dtype == aten_to_data_type(scalar_type), + "Expected ", + dtype, + " but got ", + aten_to_data_type(scalar_type), + "."); + std::vector buffer; + buffer.reserve(tensor.element_size()); + buffer.insert( + buffer.end(), + (std::byte*)tensor.data_ptr(), + (std::byte*)tensor.data_ptr() + tensor.element_size()); + return buffer; } else if (argument.is()) { // FUSER_PERF_SCOPE("polymorphicValueToBytes(Pointer)"); NVF_ERROR( From ddd62d3ed3f094f6977cd764f296a883a093f0fb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 11:50:05 -0800 Subject: [PATCH 08/35] Fix TMA Support. --- csrc/runtime/executor.cpp | 132 +++++++++++++++++++++++--------------- csrc/runtime/executor.h | 4 ++ 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 0d9a1815001..99cf5ffbc73 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -689,13 +689,7 @@ void KernelExecutor::initializeExecutorEntry( false, false)); } - } else { - input_info.emplace_back(GlobalBufferInfo()); } - std::cout << "input_info.back().shape_info.logical_sizes: " - << input_info.back().shape_info.logical_sizes << std::endl; - std::cout << "input_info.back().shape_info.allocation_strides: " - << input_info.back().shape_info.allocation_strides << std::endl; } std::vector output_info; @@ -877,29 +871,43 @@ void KernelExecutor::computeArgs( } } -// set the arguments that we'll pass to cuLaunchKernel -// TODO: Add to header +namespace { +GlobalBufferInfo& linear_buffer_info_getter( + KernelExecutorEntry& entry, + size_t idx) { + if (idx < entry.inputs.size()) { + return entry.inputs[idx]; + } else if (idx < entry.inputs.size() + entry.outputs.size()) { + return entry.outputs[idx - entry.inputs.size()]; + } else if ( + idx < + entry.inputs.size() + entry.outputs.size() + entry.intermediates.size()) { + return 
entry + .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; + } else { + NVF_CHECK( + 0, + "Invalid buffer index: ", + idx, + " input size: ", + entry.inputs.size(), + " output size: ", + entry.outputs.size(), + " intermediate size: ", + entry.intermediates.size()); + } +}; +} // namespace + void KernelExecutor::computeArgs2( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { - std::cout << "Args: " << args.toString() << std::endl; - FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); entry.arg_ptrs.resize(args.size()); } - NVF_ERROR( - args.size() == - compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + - entry.intermediates.size(), - "Argument size mismatch, expected: ", - compiled_kernel_->kernel()->inputs().size() + entry.outputs.size() + - entry.intermediates.size(), - " got: ", - args.size()); - NVF_ERROR( args.size() == compiled_kernel_->kernel()->parameters().size(), "Argument size mismatch, expected: ", @@ -907,31 +915,8 @@ void KernelExecutor::computeArgs2( " got: ", args.size()); - auto buffer_info = [&](size_t idx) -> GlobalBufferInfo& { - if (idx < entry.inputs.size()) { - return entry.inputs[idx]; - } else if (idx < entry.inputs.size() + entry.outputs.size()) { - return entry.outputs[idx - entry.inputs.size()]; - } else if ( - idx < entry.inputs.size() + entry.outputs.size() + - entry.intermediates.size()) { - return entry - .intermediates[idx - entry.inputs.size() - entry.outputs.size()]; - } else { - NVF_CHECK( - 0, - "Invalid buffer index: ", - idx, - " input size: ", - entry.inputs.size(), - " output size: ", - entry.outputs.size(), - " intermediate size: ", - entry.intermediates.size()); - } - }; - const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); + int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { std::vector bytes; if (args[arg_idx].is()) { @@ -940,17 +925,19 @@ void 
KernelExecutor::computeArgs2( tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); auto data = tensor.data_ptr(); - const auto& logical_size = buffer_info(arg_idx).shape_info.logical_sizes; + const auto& logical_size = + linear_buffer_info_getter(entry, buffer_info_idx) + .shape_info.logical_sizes; const auto& alloc_stride = - buffer_info(arg_idx).shape_info.allocation_strides; - + linear_buffer_info_getter(entry, buffer_info_idx) + .shape_info.allocation_strides; + buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { bytes.reserve( sizeof(void*) + sizeof(int64_t) * logical_size.size() + sizeof(int64_t) * alloc_stride.size()); - bytes.insert( - bytes.end(), (std::byte*)data, (std::byte*)data + sizeof(void*)); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); bytes.insert( bytes.end(), (std::byte*)logical_size.data(), @@ -965,8 +952,7 @@ void KernelExecutor::computeArgs2( bytes.reserve( sizeof(void*) + sizeof(int32_t) * logical_size.size() + sizeof(int32_t) * alloc_stride.size()); - bytes.insert( - bytes.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); std::vector logical_size32( logical_size.begin(), logical_size.end()); bytes.insert( @@ -1002,7 +988,6 @@ void KernelExecutor::recomputeArgs( ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); - // assert(entry.init && "entry was never initialized"); const std::vector& params = kernel->parameters(); const PrimDataType idx_type = kernel->indexType(); @@ -1107,6 +1092,38 @@ void KernelExecutor::resetCompiledKernelProperties() { static_smem_size_.reset(); } +KernelArgumentHolder KernelExecutor::resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const { + ExpressionEvaluator expr_eval; + int64_t arg_idx = 0; + NVF_ERROR( + 
entry.inputs.size() == compiled_kernel_->kernel()->inputs().size(), + "Input size mismatch"); + for (auto inp_idx : c10::irange(entry.inputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->inputs()[inp_idx], args[arg_idx++]); + } + + NVF_ERROR( + entry.outputs.size() == compiled_kernel_->kernel()->outputs().size(), + "Output size mismatch"); + for (auto out_idx : c10::irange(entry.outputs.size())) { + expr_eval.bind( + compiled_kernel_->kernel()->outputs()[out_idx], args[arg_idx++]); + } + + for (auto intermediate_entry : entry.intermediates) { + expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + } + + KernelArgumentHolder resolved_args; + for (auto param : compiled_kernel_->kernel()->parameters()) { + resolved_args.push(expr_eval.evaluate(param)); + } + return resolved_args; +} + std::vector KernelExecutor::run( KernelArgumentHolder args, std::vector outputs, @@ -1279,6 +1296,17 @@ std::vector KernelExecutor::run( } } + if (args.size() != compiled_kernel_->kernel()->parameters().size()) { + std::vector exprs = compiled_kernel_->kernel()->exprs(); + NVF_ERROR( + std::any_of( + exprs.begin(), + exprs.end(), + [](Expr* e) { return ir_utils::isCpAsyncBulk(e); }), + "Argument mismatch detected in run, but is not resolveable."); + args = resolveTMA(*executor_entry, args); + } + computeArgs2(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 70f372312d6..50c97e2eece 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -240,6 +240,10 @@ class KernelExecutor : public ExecutorAbstract { KernelExecutorEntry& entry, const KernelArgumentHolder& args) const; + KernelArgumentHolder resolveTMA( + KernelExecutorEntry& entry, + const KernelArgumentHolder& args) const; + // Updates an existing set of arguments based on the current arguments. It is // is an error to call this before `computeArgs` has been invoked. 
// recomputeArgs will fail if the arity of the function changes, or the rank From f91c252185dee1ffef30f0d49141415c9698c21f Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 12:36:25 -0800 Subject: [PATCH 09/35] Drop. --- csrc/runtime/allocations.cpp | 40 ++++++++++++++++++++++++++++ csrc/runtime/allocations.h | 6 +++++ csrc/runtime/executor.cpp | 33 +++++++++++++++++++++-- tests/cpp/test_allocation_domain.cpp | 4 +++ 4 files changed, 81 insertions(+), 2 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 3f242bfb63d..407b135004d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -806,6 +806,46 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } +std::vector getInputBufferInfos( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + const std::vector& fusion_inputs, + const std::vector& inputs) { + NVF_ERROR( + fusion_inputs.size() == inputs.size(), + "Mismatch in inputs provided, expected ", + fusion_inputs.size(), + " but got ", + inputs.size()); + std::vector buffer_infos; + for (auto i : c10::irange(fusion_inputs.size())) { + GlobalBufferInfo buffer_info; + buffer_info.tv = fusion_inputs[i]->as(); + auto logical_sizes = inputs[i].sizes().vec(); + auto logical_strides = inputs[i].strides().vec(); + TensorShapeInfo shape_info; + shape_info.logical_sizes = logical_sizes; + shape_info.logical_strides = logical_strides; + buffer_info.shape_info = shape_info; + buffer_info.type = inputs[i].scalar_type(); + + // TODO: Handle input allocation domains that aren't permutes + // of the logical domain + if (buffer_info.tv->hasAllocation()) { + auto allocation_size_stride = inferAllocationShape( + buffer_info.tv, expr_eval); + buffer_info.shape_info.allocation_sizes = allocation_size_stride.first; + buffer_info.shape_info.allocation_strides = allocation_size_stride.second; + } else { + 
buffer_info.shape_info.allocation_sizes = logical_sizes; + buffer_info.shape_info.allocation_strides = logical_strides; + } + + buffer_infos.emplace_back(buffer_info); + } + return buffer_infos; + } + TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 999692f936e..0f6e5b4a3f1 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -103,4 +103,10 @@ std::vector getBufferInfos( DataType index_dtype, const std::vector& fusion_outputs); +std::vector getInputBufferInfos( + ExpressionEvaluator& expr_eval, + DataType index_dtype, + const std::vector& fusion_outputs, + const std::vector& inputs); + } // namespace nvfuser diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 99cf5ffbc73..fe00df2f742 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -670,8 +670,21 @@ void KernelExecutor::initializeExecutorEntry( auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; if (input->isA()) { if (input->as()->hasAllocation()) { - input_info.emplace_back(getBufferInfos( - expr_eval, index_type, {input->as()})[0]); + std::cout << "Get buffer info for: " << input->toString() << std::endl; + std::cout << "Arg: " + << PolymorphicValue_functions::toString(args[inp_idx]) + << std::endl; + auto buffer_info = + getInputBufferInfos(expr_eval, index_type, {input->as()})[0]; + std::cout << "Buffer info allocation sizes: " + << buffer_info.shape_info.allocation_sizes << std::endl; + std::cout << "Buffer info allocation strides: " + << buffer_info.shape_info.allocation_strides << std::endl; + std::cout << "Buffer info logical sizes: " + << buffer_info.shape_info.logical_sizes << std::endl; + std::cout << "Buffer info logical strides: " + << buffer_info.shape_info.logical_strides << std::endl; + input_info.emplace_back(buffer_info); } else { TensorShapeInfo shape_info; shape_info.logical_sizes = 
args[inp_idx].as().sizes().vec(); @@ -915,6 +928,19 @@ void KernelExecutor::computeArgs2( " got: ", args.size()); + for (auto inp : compiled_kernel_->kernel()->inputs()) { + if (!inp->isA()) { + continue; + } + std::cout << "Input root: " << inp->as()->getRootDomain() + << std::endl; + std::cout << "Input logical: " << inp->as()->getLogicalDomain() + << std::endl; + std::cout << "Input alloc: " + << inp->as()->getMaybeAllocationDomain() << std::endl; + } + std::cout << "Args: " << args.toString() << std::endl; + const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { @@ -931,6 +957,9 @@ void KernelExecutor::computeArgs2( const auto& alloc_stride = linear_buffer_info_getter(entry, buffer_info_idx) .shape_info.allocation_strides; + std::cout << "Binding Tensor: " << (int64_t)data + << " size: " << logical_size << " stride: " << alloc_stride + << std::endl; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. 
if (idx_type == PrimDataType::Int) { diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 849cd140219..2972a883692 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1098,7 +1098,11 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { fusion->addOutput(tv1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + // allocate [8,8] strided [8, 1] + // modify to [4,8] strided [1, 8] + // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); + std::cout << debug_str(t0) << std::endl; FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); From cf15f4385027ed0d5e9e439c4539dedcea57ece1 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 14:25:09 -0800 Subject: [PATCH 10/35] All but one allocation domain test working. --- csrc/runtime/allocations.cpp | 30 ++++++++++++--- csrc/runtime/executor.cpp | 71 ++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 37 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 407b135004d..8f66ea6889e 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -832,10 +832,30 @@ std::vector getInputBufferInfos( // TODO: Handle input allocation domains that aren't permutes // of the logical domain if (buffer_info.tv->hasAllocation()) { - auto allocation_size_stride = inferAllocationShape( - buffer_info.tv, expr_eval); - buffer_info.shape_info.allocation_sizes = allocation_size_stride.first; - buffer_info.shape_info.allocation_strides = allocation_size_stride.second; + auto logical_domain = + TensorDomain::noReductions(buffer_info.tv->getLogicalDomain()); + auto allocation_domain = TensorDomain::noReductions( + buffer_info.tv->getMaybeAllocationDomain()); + std::unordered_map logical_to_allocation_map; + for 
(int64_t logical_idx : c10::irange(logical_domain.size())) { + auto allocation_id = std::find( + allocation_domain.begin(), + allocation_domain.end(), + logical_domain[logical_idx]); + NVF_ERROR( + allocation_id != allocation_domain.end(), + "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); + logical_to_allocation_map[logical_idx] = + std::distance(allocation_domain.begin(), allocation_id); + } + std::vector allocation_sizes(allocation_domain.size()); + std::vector allocation_strides(allocation_domain.size()); + for (auto i : c10::irange(allocation_domain.size())) { + allocation_sizes[i] = logical_sizes[logical_to_allocation_map[i]]; + allocation_strides[i] = logical_strides[logical_to_allocation_map[i]]; + } + buffer_info.shape_info.allocation_sizes = allocation_sizes; + buffer_info.shape_info.allocation_strides = allocation_strides; } else { buffer_info.shape_info.allocation_sizes = logical_sizes; buffer_info.shape_info.allocation_strides = logical_strides; @@ -844,7 +864,7 @@ std::vector getInputBufferInfos( buffer_infos.emplace_back(buffer_info); } return buffer_infos; - } +} TensorShapeInfo inferTensorShapes( TensorView* tv, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index fe00df2f742..e8f13620084 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -668,40 +668,47 @@ void KernelExecutor::initializeExecutorEntry( for (auto inp_idx : c10::irange(compiled_kernel_->kernel()->inputs().size())) { auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; - if (input->isA()) { - if (input->as()->hasAllocation()) { - std::cout << "Get buffer info for: " << input->toString() << std::endl; - std::cout << "Arg: " - << PolymorphicValue_functions::toString(args[inp_idx]) - << std::endl; - auto buffer_info = - getInputBufferInfos(expr_eval, index_type, {input->as()})[0]; - std::cout << "Buffer info allocation sizes: " - << buffer_info.shape_info.allocation_sizes << 
std::endl; - std::cout << "Buffer info allocation strides: " - << buffer_info.shape_info.allocation_strides << std::endl; - std::cout << "Buffer info logical sizes: " - << buffer_info.shape_info.logical_sizes << std::endl; - std::cout << "Buffer info logical strides: " - << buffer_info.shape_info.logical_strides << std::endl; - input_info.emplace_back(buffer_info); + if (auto input_tv = dynamic_cast(input)) { + std::cout << "Get buffer info for: " << input_tv->toString() << std::endl; + std::cout << "Arg: " + << PolymorphicValue_functions::toString(args[inp_idx]) + << std::endl; + + auto at_tensor = args[inp_idx].as(); + std::vector alloc_sizes; + std::vector alloc_strides; + if (input_tv->hasAllocation()) { + std::tie(alloc_sizes, alloc_strides) = + inferAndValidateAllocationSizesAndStrides( + at_tensor, input_tv, expr_eval); } else { - TensorShapeInfo shape_info; - shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); - shape_info.logical_strides = - args[inp_idx].as().strides().vec(); - shape_info.allocation_sizes = - args[inp_idx].as().sizes().vec(); - shape_info.allocation_strides = - args[inp_idx].as().strides().vec(); - input_info.emplace_back(GlobalBufferInfo( - input->as(), - shape_info, - data_type_to_aten(input->dtype()), - false, - false, - false)); + alloc_sizes = at_tensor.sizes().vec(); + alloc_strides = at_tensor.strides().vec(); } + + TensorShapeInfo shape_info; + shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); + shape_info.logical_strides = + args[inp_idx].as().strides().vec(); + shape_info.allocation_sizes = alloc_sizes; + shape_info.allocation_strides = alloc_strides; + auto buffer_info = GlobalBufferInfo( + input->as(), + shape_info, + data_type_to_aten(input->dtype()), + false, + false, + false); + + std::cout << "Buffer info allocation sizes: " + << buffer_info.shape_info.allocation_sizes << std::endl; + std::cout << "Buffer info allocation strides: " + << buffer_info.shape_info.allocation_strides << std::endl; + std::cout 
<< "Buffer info logical sizes: " + << buffer_info.shape_info.logical_sizes << std::endl; + std::cout << "Buffer info logical strides: " + << buffer_info.shape_info.logical_strides << std::endl; + input_info.emplace_back(buffer_info); } } From eaca8a8f8a607a18535f8f606592ecf4331ff730 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Tue, 25 Feb 2025 14:32:53 -0800 Subject: [PATCH 11/35] All allocation tests working. --- csrc/runtime/executor.cpp | 39 ++++++---------------------- tests/cpp/test_allocation_domain.cpp | 2 +- 2 files changed, 9 insertions(+), 32 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index e8f13620084..f65fda23d49 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -669,12 +669,8 @@ void KernelExecutor::initializeExecutorEntry( c10::irange(compiled_kernel_->kernel()->inputs().size())) { auto input = compiled_kernel_->kernel()->inputs()[inp_idx]; if (auto input_tv = dynamic_cast(input)) { - std::cout << "Get buffer info for: " << input_tv->toString() << std::endl; - std::cout << "Arg: " - << PolymorphicValue_functions::toString(args[inp_idx]) - << std::endl; - auto at_tensor = args[inp_idx].as(); + std::vector alloc_sizes; std::vector alloc_strides; if (input_tv->hasAllocation()) { @@ -692,23 +688,14 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - auto buffer_info = GlobalBufferInfo( - input->as(), + + input_info.emplace_back(GlobalBufferInfo( + input_tv, shape_info, - data_type_to_aten(input->dtype()), + data_type_to_aten(input_tv->dtype()), false, false, - false); - - std::cout << "Buffer info allocation sizes: " - << buffer_info.shape_info.allocation_sizes << std::endl; - std::cout << "Buffer info allocation strides: " - << buffer_info.shape_info.allocation_strides << std::endl; - std::cout << "Buffer info logical sizes: " - << 
buffer_info.shape_info.logical_sizes << std::endl; - std::cout << "Buffer info logical strides: " - << buffer_info.shape_info.logical_strides << std::endl; - input_info.emplace_back(buffer_info); + false)); } } @@ -748,9 +735,9 @@ void KernelExecutor::initializeExecutorEntry( auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); - NVF_ERROR( + TORCH_WARN_ONCE( alias_info.type != AllocationType::Evaluate, - "Outputs should not be evaluate type for kernels."); + "Outputs should not be evaluate type for kernels, this will be ignored and a kernel will produce the output tensor."); if (alias_info.type == AllocationType::New) { continue; } @@ -939,14 +926,7 @@ void KernelExecutor::computeArgs2( if (!inp->isA()) { continue; } - std::cout << "Input root: " << inp->as()->getRootDomain() - << std::endl; - std::cout << "Input logical: " << inp->as()->getLogicalDomain() - << std::endl; - std::cout << "Input alloc: " - << inp->as()->getMaybeAllocationDomain() << std::endl; } - std::cout << "Args: " << args.toString() << std::endl; const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; @@ -964,9 +944,6 @@ void KernelExecutor::computeArgs2( const auto& alloc_stride = linear_buffer_info_getter(entry, buffer_info_idx) .shape_info.allocation_strides; - std::cout << "Binding Tensor: " << (int64_t)data - << " size: " << logical_size << " stride: " << alloc_stride - << std::endl; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. 
if (idx_type == PrimDataType::Int) { diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 2972a883692..ead0014bdcc 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1102,7 +1102,7 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { // modify to [4,8] strided [1, 8] // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - std::cout << debug_str(t0) << std::endl; + FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); From 651dd68f0f2e07d6f00f85683e3972e83bad795c Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 21:56:13 +0000 Subject: [PATCH 12/35] Debugging in process. --- csrc/runtime/allocations.cpp | 66 ++++++++++++++++++++- csrc/runtime/executor.cpp | 95 +++++++++++++++++++++++------- csrc/runtime/executor.h | 9 +++ csrc/runtime/executor_kernel_arg.h | 4 ++ csrc/utils.cpp | 8 ++- tests/cpp/test_gpu3.cpp | 10 ++++ 6 files changed, 168 insertions(+), 24 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 8f66ea6889e..54fa2e08e46 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -394,6 +394,8 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); + std::cout << "Allocating output tensor: " << debug_str(alloc_tensor) + << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -403,10 +405,38 @@ std::vector allocateKernelOutputs( entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), "Tried to grab an out of range input argument."); auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; + ExpressionEvaluator ee; + ee.bind( + fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], + input_arg); + std::cout << "Input arg: " << debug_str(input_arg.as()) + << std::endl; + std::cout 
<< "Aliased output tensor: " + << debug_str(ee.evaluate(out_info.tv).as()) + << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); - out_tensors.emplace_back(input_arg.as()); + if (input_arg.as().sizes() != + out_info.shape_info.logical_sizes || + input_arg.as().strides() != + out_info.shape_info.logical_strides) { + out_tensors.emplace_back(input_arg.as().as_strided( + out_info.shape_info.logical_sizes, + out_info.shape_info.logical_strides)); + } else { + out_tensors.emplace_back(input_arg.as()); + } + std::cout + << "Aliasing T" << out_info.tv->name() << " to T" + << fusion->inputs()[entry.output_aliased_to_input.at(out_idx)]->name() + << std::endl; + std::cout << "Aliased output tensor: " << debug_str(out_tensors.back()) + << std::endl; + std::cout << "Aliased output tensor logical sizes: " + << out_info.shape_info.logical_sizes << std::endl; + std::cout << "Aliased output tensor logical strides: " + << out_info.shape_info.logical_strides << std::endl; } } return out_tensors; @@ -424,10 +454,20 @@ GlobalBufferInfo getBufferInfo( auto dtype = (info.tv->dtype() == DataType::Index ? 
index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); + std::cout << " Getting global buffer info for T" << tv->name() << " " + << tv->getLogicalDomain() << std::endl; + std::cout << "Logical sizes: " << info.shape_info.logical_sizes << std::endl; + std::cout << "Allocation sizes: " << info.shape_info.allocation_sizes + << std::endl; + std::cout << "Logical strides: " << info.shape_info.logical_strides + << std::endl; + std::cout << "Allocation strides: " << info.shape_info.allocation_strides + << std::endl; return info; } } // namespace + std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, @@ -869,6 +909,30 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { + + // Alias handling: + auto alias_info = tv->fusion()->getOutputAlias(tv); + if(alias_info.type != AllocationType::New){ + auto tensor = expr_eval.evaluate(tv); + std::pair, std::vector> logical_size_stride = {tensor.sizes().vec(), tensor.strides().vec()}; + if(!tv->hasAllocation()){ + return TensorShapeInfo{ + logical_size_stride.first, + logical_size_stride.second, + logical_size_stride.first, + logical_size_stride.second}; + } + + auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides( + tensor, tv, expr_eval); + return TensorShapeInfo{ + logical_size_stride.first, + logical_size_stride.second, + allocation_size_stride.first, + allocation_size_stride.second}; + } + + // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { return TensorShapeInfo{ diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index f65fda23d49..252ff2c2f25 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -282,6 +282,36 @@ void KernelExecutor::compile( if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).stopCompile(); } + + for(auto expr : exprs){ + if (ir_utils::isCpAsyncBulk(expr)) { + 
has_tma_ = true; + } + if (expr->isA()) { + has_rng_ = true; + } + } + + for(auto output : fusion->outputs()){ + if(output->isA()){ + auto out_tv = output->as(); + auto alias_info = fusion->getOutputAlias(out_tv); + if(alias_info.type == AllocationType::New){ + continue; + } + auto aliased_to = alias_info.aliased_io->as(); + auto inputs = InputsOf::output(out_tv); + for(auto input : inputs){ + if(input->isA() && input->sameAs(aliased_to)){ + continue; + } + } + if(val->isConst()){ + continue; + } + has_dynamic_alias_ = true; + } + } } LaunchParams KernelExecutor::computeLaunchParams( @@ -487,7 +517,7 @@ std::vector KernelExecutor::getIntermediateBufferInfo( info.shape_info.allocation_strides = strides; info.shape_info.logical_sizes = sizes; info.shape_info.logical_strides = strides; - auto dtype = (tv->dtype() == DataType::Index ? index_type : tv->dtype()); + auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype(); info.type = data_type_to_aten(dtype); // Remember the tensor buffer used for storing kernel profile @@ -688,14 +718,14 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - - input_info.emplace_back(GlobalBufferInfo( + GlobalBufferInfo info( input_tv, shape_info, data_type_to_aten(input_tv->dtype()), false, false, - false)); + false); + input_info.emplace_back(info); } } @@ -932,10 +962,9 @@ void KernelExecutor::computeArgs2( int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { std::vector bytes; - if (args[arg_idx].is()) { + if (args[arg_idx].is() && + args[arg_idx].as().is_cuda()) { auto tensor = args[arg_idx].as(); - NVF_ERROR( - tensor.is_cuda(), "Only accepts CUDA tensors or CPU scalar tensors."); auto data = tensor.data_ptr(); const auto& logical_size = @@ -1105,6 +1134,26 @@ void KernelExecutor::resetCompiledKernelProperties() { static_smem_size_.reset(); } +namespace { 
+KernelArgumentHolder resolveRNGSeed( + const kir::Kernel* kernel, + KernelArgumentHolder& args) { + ExpressionEvaluator expr_eval; + KernelArgumentHolder resolved_args; + resolved_args.reserve(args.size()); + int64_t arg_idx = 0; + for (auto param : kernel->parameters()) { + if (param->definition() && + param->definition()->isA()) { + resolved_args.push(expr_eval.evaluate(param)); + } else { + resolved_args.push(args[arg_idx++]); + } + } + return resolved_args; +} +} // namespace + KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { @@ -1156,6 +1205,11 @@ std::vector KernelExecutor::run( sprof.startKernel(); } + ExpressionEvaluator expr_eval; + if(has_dynamic_alias_ || has_tma_){ + expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); + } + NVF_ERROR(isCompiled()); NVF_ERROR( outputs.empty() || @@ -1214,10 +1268,6 @@ std::vector KernelExecutor::run( // context manager to disable auto grad for `empty_cuda` calls later at::AutoDispatchBelowADInplaceOrView non_variable_type_mode; - // Bind fusion inputs - // auto expr_eval = executor_utils::bindInputs(args, - // compiled_kernel_->kernel()); - // only allocate outputs when not given if (outputs.empty()) { outputs = allocateKernelOutputs( @@ -1310,14 +1360,20 @@ std::vector KernelExecutor::run( } if (args.size() != compiled_kernel_->kernel()->parameters().size()) { - std::vector exprs = compiled_kernel_->kernel()->exprs(); - NVF_ERROR( - std::any_of( - exprs.begin(), - exprs.end(), - [](Expr* e) { return ir_utils::isCpAsyncBulk(e); }), - "Argument mismatch detected in run, but is not resolveable."); - args = resolveTMA(*executor_entry, args); + NVF_ERROR(has_tma_ || has_rng_, "No TMA or RNG found in the kernel, but detected an argument size mismatch."); + // If args don't match one of two things is happening. We need to add TMA + // related args or RNG related args. Resolve these scenarios. 
+ if (has_tma_) { + // Resolving TMA requires binding all values and evaluating the TMA + // arguments + args = resolveTMA(*executor_entry, args); + } + if (has_rng_) { + // Resolving RNG seed requires evaluating and adding those values, but + // doesn't require binding all values as getting RNG seed and offset + // doesn't depend on other values + args = resolveRNGSeed(compiled_kernel_->kernel(), args); + } } computeArgs2(*executor_entry, args); @@ -1418,7 +1474,6 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } - return outputs; } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 50c97e2eece..6d79300a067 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -314,6 +314,15 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; + // Has an RNG kernel and therefore needs to infer RNG state through expression evaluator + bool has_rng_ = false; + + // Has a TMA kernel and therefore needs to infer TMA inputs through expression evaluator + bool has_TMA_ = false; + + // Has a dynamic alias and therefore needs to infer what they are through expression evaluator + bool has_dynamic_alias_ = false; + // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. 
std::unordered_map executor_entry_lookup_; diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index f2026ebe2e6..40d44866165 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -74,6 +74,10 @@ class NVF_API KernelArgumentHolder { } } + void reserve(size_t size) { + arguments_.reserve(size); + } + void push(const std::vector& tensors); void push(const c10::ArrayRef& args); void push(std::initializer_list args) { diff --git a/csrc/utils.cpp b/csrc/utils.cpp index c24ae356cbb..0358908a6e1 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -43,13 +43,15 @@ std::string debug_str(const at::Tensor& tensor) { std::stringstream ss; ss << "Tensor:"; ss << " shape: " << tensor.sizes(); - ss << ", dtype: " << tensor.dtype(); - ss << ", device: " << tensor.device(); - ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); if (!tensor.is_contiguous()) { ss << ", strides: " << tensor.strides(); } + + ss << ", dtype: " << tensor.dtype(); + ss << ", device: " << tensor.device(); + ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); + return ss.str(); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 3adf985a898..ac5ed810ec2 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -4302,6 +4302,16 @@ TEST_F(NVFuserTest, FusionIssue2068_CUDA) { FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1, t2, t3, t4}); + std::cout << "Generated outputs:" << std::endl; + for (auto output : cg_outputs) { + std::cout << " " << debug_str(output) << std::endl; + } + + for (auto output : fusion.outputs()) { + std::cout << "T" << output->name() << " " + << output->as()->getLogicalDomain() << std::endl; + } + testValidate( executor_cache.fusion(), cg_outputs, From ce150d97fab5f15e63e41b0cbd36082b806cf7ae Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 15:12:19 -0800 Subject: [PATCH 13/35] 
Down to 5 failures in test_nvfuser. --- csrc/runtime/allocations.cpp | 28 +++++++++++++--------------- csrc/runtime/executor.cpp | 29 +++++++++++++++-------------- csrc/runtime/executor.h | 9 ++++++--- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 54fa2e08e46..4688e8a3338 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -410,7 +410,7 @@ std::vector allocateKernelOutputs( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); std::cout << "Input arg: " << debug_str(input_arg.as()) - << std::endl; + << std::endl; std::cout << "Aliased output tensor: " << debug_str(ee.evaluate(out_info.tv).as()) << std::endl; @@ -909,25 +909,23 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { - // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); - if(alias_info.type != AllocationType::New){ - auto tensor = expr_eval.evaluate(tv); - std::pair, std::vector> logical_size_stride = {tensor.sizes().vec(), tensor.strides().vec()}; - if(!tv->hasAllocation()){ + if (alias_info.type != AllocationType::New) { + auto val = expr_eval.evaluate(tv); + auto tensor = val.as(); + if (!tv->hasAllocation()) { return TensorShapeInfo{ - logical_size_stride.first, - logical_size_stride.second, - logical_size_stride.first, - logical_size_stride.second}; + tensor.sizes().vec(), + tensor.strides().vec(), + tensor.sizes().vec(), + tensor.strides().vec()}; } - - auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides( - tensor, tv, expr_eval); + auto allocation_size_stride = + inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); return TensorShapeInfo{ - logical_size_stride.first, - logical_size_stride.second, + tensor.sizes().vec(), + tensor.strides().vec(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git 
a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 252ff2c2f25..8bfd8cb7ef1 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -283,30 +283,30 @@ void KernelExecutor::compile( FusionProfiler::segment(group_id_).stopCompile(); } - for(auto expr : exprs){ + for (auto expr : exprs) { if (ir_utils::isCpAsyncBulk(expr)) { - has_tma_ = true; + has_TMA_ = true; } if (expr->isA()) { has_rng_ = true; } } - for(auto output : fusion->outputs()){ - if(output->isA()){ + for (auto output : fusion->outputs()) { + if (output->isA()) { auto out_tv = output->as(); auto alias_info = fusion->getOutputAlias(out_tv); - if(alias_info.type == AllocationType::New){ + if (alias_info.type == AllocationType::New) { continue; } auto aliased_to = alias_info.aliased_io->as(); auto inputs = InputsOf::output(out_tv); - for(auto input : inputs){ - if(input->isA() && input->sameAs(aliased_to)){ + for (auto input : inputs) { + if (input->isA() && input->sameAs(aliased_to)) { continue; } } - if(val->isConst()){ + if (out_tv->isConst()) { continue; } has_dynamic_alias_ = true; @@ -654,10 +654,9 @@ void KernelExecutor::initializeExecutorEntry( DataType index_type) { FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); - ExpressionEvaluator expr_eval; - evaluatorPrecomputedValues()->bindInputs(args); + ExpressionEvaluator expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); - auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); @@ -1206,7 +1205,7 @@ std::vector KernelExecutor::run( } ExpressionEvaluator expr_eval; - if(has_dynamic_alias_ || has_tma_){ + if (has_dynamic_alias_ || has_TMA_) { expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); } @@ -1360,10 +1359,12 @@ std::vector KernelExecutor::run( } if (args.size() != compiled_kernel_->kernel()->parameters().size()) { - NVF_ERROR(has_tma_ || has_rng_, "No 
TMA or RNG found in the kernel, but detected an argument size mismatch."); + NVF_ERROR( + has_TMA_ || has_rng_, + "No TMA or RNG found in the kernel, but detected an argument size mismatch."); // If args don't match one of two things is happening. We need to add TMA // related args or RNG related args. Resolve these scenarios. - if (has_tma_) { + if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments args = resolveTMA(*executor_entry, args); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 6d79300a067..2fa54473a12 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -314,13 +314,16 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; - // Has an RNG kernel and therefore needs to infer RNG state through expression evaluator + // Has an RNG kernel and therefore needs to infer RNG state through expression + // evaluator bool has_rng_ = false; - // Has a TMA kernel and therefore needs to infer TMA inputs through expression evaluator + // Has a TMA kernel and therefore needs to infer TMA inputs through expression + // evaluator bool has_TMA_ = false; - // Has a dynamic alias and therefore needs to infer what they are through expression evaluator + // Has a dynamic alias and therefore needs to infer what they are through + // expression evaluator bool has_dynamic_alias_ = false; // lookup table to take short cut to retrieve recorded information in order to From 561dfb1a10e4a8bf34341a36799c99f74ee60135 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 17:11:54 -0800 Subject: [PATCH 14/35] Slice fix. 
--- csrc/runtime/allocations.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 4688e8a3338..c98b7509bab 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -409,11 +409,13 @@ std::vector allocateKernelOutputs( ee.bind( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); + auto output = ee.evaluate(out_info.tv).as(); std::cout << "Input arg: " << debug_str(input_arg.as()) << std::endl; + std::cout << "Aliased output tennsor: " + << fusion->outputs()[out_idx]->toString() << std::endl; std::cout << "Aliased output tensor: " - << debug_str(ee.evaluate(out_info.tv).as()) - << std::endl; + << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -421,11 +423,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - out_tensors.emplace_back(input_arg.as().as_strided( + std::cout<<"As strided?"<()); + out_tensors.emplace_back(output); } std::cout << "Aliasing T" << out_info.tv->name() << " to T" From 868afb72aefde3266c35d3f624dcf52eeb7d054c Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 26 Feb 2025 18:01:03 -0800 Subject: [PATCH 15/35] Debugging. 
--- csrc/runtime/executor.cpp | 5 +++++ tests/cpp/test_multidevice_sharding.cpp | 16 ++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8bfd8cb7ef1..140c42f862c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -717,6 +717,11 @@ void KernelExecutor::initializeExecutorEntry( args[inp_idx].as().strides().vec(); shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; + std::cout<<"Input shape info: "<setAllocationDomain(tv->getLoopDomain(), true); } - - const int64_t b = 2; - const int64_t h = d * 3; - const int64_t s = 5; + x->setContiguity({false, false, true, true, true}); + const int64_t b = 1; + const int64_t h = d * 2; + const int64_t s = 2; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); - at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - + + // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); + at::Tensor x_tensor = at::randint(5, {b, h/d, s, s}, tensor_options); + std::cout<<"Properties: "< Date: Thu, 27 Feb 2025 14:43:49 -0800 Subject: [PATCH 16/35] Debugging still. --- csrc/runtime/allocations.cpp | 31 +++++- csrc/runtime/allocations.h | 1 + csrc/runtime/executor.cpp | 120 +++++++++++++++++++++--- csrc/serde/fusion_cache.fbs | 1 + csrc/tensor_metadata.cpp | 5 + tests/cpp/test_multidevice_sharding.cpp | 16 ++-- 6 files changed, 148 insertions(+), 26 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index c98b7509bab..cf9626599a9 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -262,6 +263,8 @@ at::Tensor allocateTensor( // Handle a fusion with duplicated outputs. 
TensorView* out_tv = out_info.tv; if (ee.isKnown(out_tv)) { + std::cout << "Evaluated 2: \n" + << ee.evaluate(out_tv).as() << std::endl; return ee.evaluate(out_tv).as(); } @@ -294,6 +297,7 @@ at::Tensor allocateTensor( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } + std::cout << "Allocated 0: \n" << alloc_tensor << std::endl; return alloc_tensor; } case AllocationType::ReuseBuffer: @@ -314,6 +318,7 @@ at::Tensor allocateTensor( aliased_io->toString()); inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); } + std::cout << "Evaluated 1: \n" << out_tensor << std::endl; return out_tensor; } default: @@ -394,8 +399,8 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); - std::cout << "Allocating output tensor: " << debug_str(alloc_tensor) - << std::endl; + std::cout << "Allocating output tensor:\n" + << debug_str(alloc_tensor) << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -414,8 +419,7 @@ std::vector allocateKernelOutputs( << std::endl; std::cout << "Aliased output tennsor: " << fusion->outputs()[out_idx]->toString() << std::endl; - std::cout << "Aliased output tensor: " - << debug_str(output) << std::endl; + std::cout << "Aliased output tensor: " << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -423,7 +427,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - std::cout<<"As strided?"< getInputBufferInfos( TensorShapeInfo shape_info; shape_info.logical_sizes = logical_sizes; shape_info.logical_strides = logical_strides; + if (isSharded(buffer_info.tv)) { + shape_info.unsharded_logical_sizes = + unshardedSizes(buffer_info.tv, logical_sizes); + } buffer_info.shape_info = shape_info; buffer_info.type = inputs[i].scalar_type(); @@ -921,6 +934,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ 
tensor.sizes().vec(), tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : tensor.sizes().vec(), tensor.sizes().vec(), tensor.strides().vec()}; } @@ -929,6 +944,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) + : tensor.sizes().vec(), allocation_size_stride.first, allocation_size_stride.second}; } @@ -939,6 +956,8 @@ TensorShapeInfo inferTensorShapes( return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, + isSharded(tv) ? unshardedSizes(tv, allocation_size_stride.first) + : allocation_size_stride.first, allocation_size_stride.first, allocation_size_stride.second}; } @@ -955,6 +974,8 @@ TensorShapeInfo inferTensorShapes( return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), + isSharded(tv) ? unshardedSizes(tv, logical_meta_tensor.sizes().vec()) + : logical_meta_tensor.sizes().vec(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 0f6e5b4a3f1..b2984d48629 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -20,6 +20,7 @@ struct KernelExecutorEntry; struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; + std::vector unsharded_logical_sizes; std::vector allocation_sizes; std::vector allocation_strides; }; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 140c42f862c..bbf652fa06d 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -172,7 +173,24 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); - + for (auto input : fusion->inputs()) { + if (input->isA()) { + auto tv = input->as(); + 
tv->setContiguity(std::vector>( + TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), + false)); + } + } + for (auto output : fusion->outputs()) { + if (output->isA()) { + auto tv = output->as(); + tv->setContiguity(std::vector>( + TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), + false)); + } + } + fusion->print(); + fusion->printKernel(); NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -360,7 +378,7 @@ LaunchParams KernelExecutor::computeLaunchParams( parallel_iter_extents, launch_constraints); expr_eval.precomputedValues()->evaluate(); } - + std::cout << "====================" << std::endl; // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make // sure if they could be inferred the inference matches what was set. @@ -371,6 +389,9 @@ LaunchParams KernelExecutor::computeLaunchParams( for (auto extent : parallel_extents) { auto inferred_val = expr_eval.evaluate(extent); if (inferred_val.hasValue()) { + std::cout << "Inferring val: " << extent->toInlineString() + << std::endl; + std::cout << inferred_val.as() << std::endl; // This value could have been inferred, make sure it was set right. 
bool valid = inferred_val.as() == launch_constraints.getDim(p_type) || @@ -381,9 +402,13 @@ LaunchParams KernelExecutor::computeLaunchParams( "this may be due to mixed broadcast axes that are parallelized."); } } else if (!expr_eval.precomputedValues()) { + std::cout << "Binding val: " << extent->toInlineString() << std::endl; + std::cout << launch_constraints.getDim(p_type) << std::endl; expr_eval.bind(extent, launch_constraints.getDim(p_type)); } if (!launch_params.hasDim(p_type)) { + std::cout << "Binding val: " << p_type << std::endl; + std::cout << launch_constraints.getDim(p_type) << std::endl; // Bind the launch constraint into our evaluation context launch_params.bind(launch_constraints.getDim(p_type), p_type); // Makes sure the p-types bound to evaluators are the @@ -396,10 +421,13 @@ LaunchParams KernelExecutor::computeLaunchParams( } } + std::cout << "====================" << std::endl; // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); + std::cout << "Evaluating val: " << extent->toInlineString() << std::endl; + std::cout << val.as() << std::endl; NVF_ERROR( val.hasValue(), "Tried to evaluate the extent, ", @@ -414,6 +442,7 @@ LaunchParams KernelExecutor::computeLaunchParams( } } + std::cout << "====================" << std::endl; // Re-run the integer machine with all // the thread sizes now determined. 
if (expr_eval.precomputedValues()) { @@ -659,7 +688,13 @@ void KernelExecutor::initializeExecutorEntry( expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); + std::cout << "Constraints: " << launch_constraints.toString() << std::endl; + std::cout << "Launch params: " << launch_params.toString() << std::endl; + + std::cout << "Expr eval:\n"; + expr_eval.print(); + // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { NVF_CHECK(expr_eval.evaluate(entry.first).as(), entry.second); } @@ -715,13 +750,23 @@ void KernelExecutor::initializeExecutorEntry( shape_info.logical_sizes = args[inp_idx].as().sizes().vec(); shape_info.logical_strides = args[inp_idx].as().strides().vec(); + if (isSharded(input_tv)) { + std::cout << "input_tv is sharded" << std::endl; + shape_info.unsharded_logical_sizes = + unshardedSizes(input_tv, shape_info.logical_sizes); + std::cout << "unsharded_logical_sizes: " + << shape_info.unsharded_logical_sizes << std::endl; + } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - std::cout<<"Input shape info: "<(); auto data = tensor.data_ptr(); - const auto& logical_size = - linear_buffer_info_getter(entry, buffer_info_idx) - .shape_info.logical_sizes; - const auto& alloc_stride = - linear_buffer_info_getter(entry, buffer_info_idx) - .shape_info.allocation_strides; + const auto& buffer_info = + linear_buffer_info_getter(entry, buffer_info_idx); + const auto& logical_size = buffer_info.shape_info.logical_sizes.size() == + buffer_info.shape_info.unsharded_logical_sizes.size() + ? 
buffer_info.shape_info.unsharded_logical_sizes + : buffer_info.shape_info.logical_sizes; + const auto& alloc_stride = buffer_info.shape_info.allocation_strides; + std::cout << "Populating buffer info for tensor"; + std::cout << " pointer: " << data; + std::cout << " logical size: " << buffer_info.shape_info.logical_sizes + << "\n"; + std::cout << " unsharded logical size: " + << buffer_info.shape_info.unsharded_logical_sizes << "\n"; + std::cout << " Unsharded logical size: " << logical_size << "\n"; + std::cout << " alloc stride: " << alloc_stride << "\n"; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { @@ -1438,6 +1492,10 @@ std::vector KernelExecutor::run( << ", warps_per_sm=" << warps_per_sm << ", occupancy=" << oss.str() << std::endl; } + std::cout << "Running KE with args:\n"; + for (const auto& arg : args) { + std::cout << debug_str(arg.as()) << "\n"; + } if (!compiled_kernel_->kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); @@ -1480,6 +1538,37 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } + std::cout << "\n\n"; + std::cout << "Entry input info:\n"; + for (auto entry : executor_entry->inputs) { + std::cout << "T" << entry.tv->name() << "\n"; + std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; + std::cout << " Logical strides: " << entry.shape_info.logical_strides + << "\n"; + std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes + << "\n"; + std::cout << " Allocation strides: " << entry.shape_info.allocation_strides + << "\n"; + } + + std::cout << "Entry output info:\n"; + for (auto entry : executor_entry->outputs) { + std::cout << "T" << entry.tv->name() << "\n"; + std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; + std::cout << " Logical strides: " << entry.shape_info.logical_strides + << 
"\n"; + std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes + << "\n"; + std::cout << " Allocation strides: " << entry.shape_info.allocation_strides + << "\n"; + } + + std::cout << "Ran KE with args:\n"; + for (const auto& arg : args) { + std::cout << debug_str(arg.as()) << "\n"; + } + std::cout << "Launch params: " << launch_params_.toString() << "\n"; + std::cout << std::endl; return outputs; } @@ -1635,6 +1724,7 @@ flatbuffers::Offset KernelExecutor::serialize( tv_position, &data.shape_info.logical_sizes, &data.shape_info.logical_strides, + &data.shape_info.unsharded_logical_sizes, &data.shape_info.allocation_sizes, &data.shape_info.allocation_strides, nvfuser::toUnderlying(data.type), @@ -1766,6 +1856,10 @@ GlobalBufferInfo KernelExecutor::deserialize( shape_info.logical_strides.emplace_back(dim_stride); } + for (auto dim_size : *buffer->unsharded_logical_sizes()) { + shape_info.unsharded_logical_sizes.emplace_back(dim_size); + } + for (auto dim_size : *buffer->alloc_sizes()) { shape_info.allocation_sizes.emplace_back(dim_size); } diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 0d7cdd97d99..3ff755b2c15 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -217,6 +217,7 @@ table GlobalBufferInfo { tv : long = -1; logical_sizes : [long]; logical_strides : [long]; + unsharded_logical_sizes : [long]; alloc_sizes : [long]; alloc_strides : [long]; dtype : long; diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index e5dd17f4aca..41fb5e50d47 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -310,6 +310,8 @@ inferAndValidateAllocationSizesAndStrides( std::vector allocation_strides; allocation_sizes.reserve(alloc.size()); allocation_strides.reserve(alloc.size()); + std::cout << "Processing allocation domain for: " << tv->name() << "\n"; + std::cout << tv->getMaybeAllocationDomain() << std::endl; for (IterDomain* id : TensorDomain::noReductions(alloc)) { if 
(id->isDeviceDim()) { allocation_sizes.push_back(1); @@ -317,7 +319,10 @@ inferAndValidateAllocationSizesAndStrides( allocation_sizes.push_back(active_ids.at(id).first); } allocation_strides.push_back(active_ids.at(id).second); + std::cout << allocation_sizes.back() << " " << allocation_strides.back() + << " for ID: " << id->toString() << "\n"; } + std::cout << std::endl; // Only validate final sizes and strides when we have a non-empty tensor. if (tensor.numel() != 0) { diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 5675a81273e..2cf57bcf8f1 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -241,18 +241,18 @@ TEST_F(MultiDeviceTest, DivideBySum) { tv->setAllocationDomain(tv->getLoopDomain(), true); } x->setContiguity({false, false, true, true, true}); - const int64_t b = 1; - const int64_t h = d * 2; - const int64_t s = 2; + const int64_t b = 7; + const int64_t h = d * 5; + const int64_t s = 3; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); - + // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - at::Tensor x_tensor = at::randint(5, {b, h/d, s, s}, tensor_options); - std::cout<<"Properties: "< Date: Thu, 27 Feb 2025 15:15:13 -0800 Subject: [PATCH 17/35] Distrbuted tests working. --- csrc/runtime/allocations.cpp | 36 --------------- csrc/runtime/executor.cpp | 89 ------------------------------------ csrc/tensor_metadata.cpp | 5 -- 3 files changed, 130 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index cf9626599a9..5c56bd0d01e 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -263,8 +263,6 @@ at::Tensor allocateTensor( // Handle a fusion with duplicated outputs. 
TensorView* out_tv = out_info.tv; if (ee.isKnown(out_tv)) { - std::cout << "Evaluated 2: \n" - << ee.evaluate(out_tv).as() << std::endl; return ee.evaluate(out_tv).as(); } @@ -297,7 +295,6 @@ at::Tensor allocateTensor( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } - std::cout << "Allocated 0: \n" << alloc_tensor << std::endl; return alloc_tensor; } case AllocationType::ReuseBuffer: @@ -318,7 +315,6 @@ at::Tensor allocateTensor( aliased_io->toString()); inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); } - std::cout << "Evaluated 1: \n" << out_tensor << std::endl; return out_tensor; } default: @@ -399,8 +395,6 @@ std::vector allocateKernelOutputs( c10::nullopt, device, c10::nullopt); - std::cout << "Allocating output tensor:\n" - << debug_str(alloc_tensor) << std::endl; if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } @@ -415,11 +409,6 @@ std::vector allocateKernelOutputs( fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], input_arg); auto output = ee.evaluate(out_info.tv).as(); - std::cout << "Input arg: " << debug_str(input_arg.as()) - << std::endl; - std::cout << "Aliased output tennsor: " - << fusion->outputs()[out_idx]->toString() << std::endl; - std::cout << "Aliased output tensor: " << debug_str(output) << std::endl; NVF_ERROR( input_arg.is(), "Aliased input argument is not a tensor."); @@ -427,28 +416,12 @@ std::vector allocateKernelOutputs( out_info.shape_info.logical_sizes || input_arg.as().strides() != out_info.shape_info.logical_strides) { - std::cout << "As strided?" 
<< std::endl; - std::cout << "Aliasing output tensor: \n" - << output.as_strided( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides) - << std::endl; out_tensors.emplace_back(output.as_strided( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides)); } else { out_tensors.emplace_back(output); } - std::cout - << "Aliasing T" << out_info.tv->name() << " to T" - << fusion->inputs()[entry.output_aliased_to_input.at(out_idx)]->name() - << std::endl; - std::cout << "Aliased output tensor: " << debug_str(out_tensors.back()) - << std::endl; - std::cout << "Aliased output tensor logical sizes: " - << out_info.shape_info.logical_sizes << std::endl; - std::cout << "Aliased output tensor logical strides: " - << out_info.shape_info.logical_strides << std::endl; } } return out_tensors; @@ -466,15 +439,6 @@ GlobalBufferInfo getBufferInfo( auto dtype = (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); info.type = data_type_to_aten(dtype); - std::cout << " Getting global buffer info for T" << tv->name() << " " - << tv->getLogicalDomain() << std::endl; - std::cout << "Logical sizes: " << info.shape_info.logical_sizes << std::endl; - std::cout << "Allocation sizes: " << info.shape_info.allocation_sizes - << std::endl; - std::cout << "Logical strides: " << info.shape_info.logical_strides - << std::endl; - std::cout << "Allocation strides: " << info.shape_info.allocation_strides - << std::endl; return info; } diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index bbf652fa06d..cf575b3c1c2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -173,24 +173,6 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); - for (auto input : fusion->inputs()) { - if (input->isA()) { - auto tv = input->as(); - tv->setContiguity(std::vector>( - TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), - 
false)); - } - } - for (auto output : fusion->outputs()) { - if (output->isA()) { - auto tv = output->as(); - tv->setContiguity(std::vector>( - TensorDomain::noReductions(tv->getMaybeAllocationDomain()).size(), - false)); - } - } - fusion->print(); - fusion->printKernel(); NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -378,7 +360,6 @@ LaunchParams KernelExecutor::computeLaunchParams( parallel_iter_extents, launch_constraints); expr_eval.precomputedValues()->evaluate(); } - std::cout << "====================" << std::endl; // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make // sure if they could be inferred the inference matches what was set. @@ -389,9 +370,6 @@ LaunchParams KernelExecutor::computeLaunchParams( for (auto extent : parallel_extents) { auto inferred_val = expr_eval.evaluate(extent); if (inferred_val.hasValue()) { - std::cout << "Inferring val: " << extent->toInlineString() - << std::endl; - std::cout << inferred_val.as() << std::endl; // This value could have been inferred, make sure it was set right. 
bool valid = inferred_val.as() == launch_constraints.getDim(p_type) || @@ -402,13 +380,9 @@ LaunchParams KernelExecutor::computeLaunchParams( "this may be due to mixed broadcast axes that are parallelized."); } } else if (!expr_eval.precomputedValues()) { - std::cout << "Binding val: " << extent->toInlineString() << std::endl; - std::cout << launch_constraints.getDim(p_type) << std::endl; expr_eval.bind(extent, launch_constraints.getDim(p_type)); } if (!launch_params.hasDim(p_type)) { - std::cout << "Binding val: " << p_type << std::endl; - std::cout << launch_constraints.getDim(p_type) << std::endl; // Bind the launch constraint into our evaluation context launch_params.bind(launch_constraints.getDim(p_type), p_type); // Makes sure the p-types bound to evaluators are the @@ -421,13 +395,10 @@ LaunchParams KernelExecutor::computeLaunchParams( } } - std::cout << "====================" << std::endl; // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); - std::cout << "Evaluating val: " << extent->toInlineString() << std::endl; - std::cout << val.as() << std::endl; NVF_ERROR( val.hasValue(), "Tried to evaluate the extent, ", @@ -442,7 +413,6 @@ LaunchParams KernelExecutor::computeLaunchParams( } } - std::cout << "====================" << std::endl; // Re-run the integer machine with all // the thread sizes now determined. 
if (expr_eval.precomputedValues()) { @@ -688,11 +658,6 @@ void KernelExecutor::initializeExecutorEntry( expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); - std::cout << "Constraints: " << launch_constraints.toString() << std::endl; - std::cout << "Launch params: " << launch_params.toString() << std::endl; - - std::cout << "Expr eval:\n"; - expr_eval.print(); // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { @@ -751,22 +716,11 @@ void KernelExecutor::initializeExecutorEntry( shape_info.logical_strides = args[inp_idx].as().strides().vec(); if (isSharded(input_tv)) { - std::cout << "input_tv is sharded" << std::endl; shape_info.unsharded_logical_sizes = unshardedSizes(input_tv, shape_info.logical_sizes); - std::cout << "unsharded_logical_sizes: " - << shape_info.unsharded_logical_sizes << std::endl; } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - std::cout << "Input shape info: " << std::endl; - std::cout << "Logical sizes: " << shape_info.logical_sizes << std::endl; - std::cout << "Allocation sizes: " << shape_info.allocation_sizes - << std::endl; - std::cout << "Logical strides: " << shape_info.logical_strides - << std::endl; - std::cout << "Allocation strides: " << shape_info.allocation_strides - << std::endl; GlobalBufferInfo info( input_tv, shape_info, @@ -1023,14 +977,6 @@ void KernelExecutor::computeArgs2( ? 
buffer_info.shape_info.unsharded_logical_sizes : buffer_info.shape_info.logical_sizes; const auto& alloc_stride = buffer_info.shape_info.allocation_strides; - std::cout << "Populating buffer info for tensor"; - std::cout << " pointer: " << data; - std::cout << " logical size: " << buffer_info.shape_info.logical_sizes - << "\n"; - std::cout << " unsharded logical size: " - << buffer_info.shape_info.unsharded_logical_sizes << "\n"; - std::cout << " Unsharded logical size: " << logical_size << "\n"; - std::cout << " alloc stride: " << alloc_stride << "\n"; buffer_info_idx++; // special handle for TensorMetaData so that CPU overhead is minimal. if (idx_type == PrimDataType::Int) { @@ -1492,10 +1438,6 @@ std::vector KernelExecutor::run( << ", warps_per_sm=" << warps_per_sm << ", occupancy=" << oss.str() << std::endl; } - std::cout << "Running KE with args:\n"; - for (const auto& arg : args) { - std::cout << debug_str(arg.as()) << "\n"; - } if (!compiled_kernel_->kernel()->summary().has_cooperative_grid_reduction) { FUSER_PERF_SCOPE("ExecutorRunFusion::cuLaunchKernel"); @@ -1538,37 +1480,6 @@ std::vector KernelExecutor::run( sprof.stopKernel(); sprof.outputBytesAccessed(computeBytes(outputs)); } - std::cout << "\n\n"; - std::cout << "Entry input info:\n"; - for (auto entry : executor_entry->inputs) { - std::cout << "T" << entry.tv->name() << "\n"; - std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; - std::cout << " Logical strides: " << entry.shape_info.logical_strides - << "\n"; - std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes - << "\n"; - std::cout << " Allocation strides: " << entry.shape_info.allocation_strides - << "\n"; - } - - std::cout << "Entry output info:\n"; - for (auto entry : executor_entry->outputs) { - std::cout << "T" << entry.tv->name() << "\n"; - std::cout << " Logical sizes: " << entry.shape_info.logical_sizes << "\n"; - std::cout << " Logical strides: " << entry.shape_info.logical_strides - << "\n"; - 
std::cout << " Allocation sizes: " << entry.shape_info.allocation_sizes - << "\n"; - std::cout << " Allocation strides: " << entry.shape_info.allocation_strides - << "\n"; - } - - std::cout << "Ran KE with args:\n"; - for (const auto& arg : args) { - std::cout << debug_str(arg.as()) << "\n"; - } - std::cout << "Launch params: " << launch_params_.toString() << "\n"; - std::cout << std::endl; return outputs; } diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 41fb5e50d47..e5dd17f4aca 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -310,8 +310,6 @@ inferAndValidateAllocationSizesAndStrides( std::vector allocation_strides; allocation_sizes.reserve(alloc.size()); allocation_strides.reserve(alloc.size()); - std::cout << "Processing allocation domain for: " << tv->name() << "\n"; - std::cout << tv->getMaybeAllocationDomain() << std::endl; for (IterDomain* id : TensorDomain::noReductions(alloc)) { if (id->isDeviceDim()) { allocation_sizes.push_back(1); @@ -319,10 +317,7 @@ inferAndValidateAllocationSizesAndStrides( allocation_sizes.push_back(active_ids.at(id).first); } allocation_strides.push_back(active_ids.at(id).second); - std::cout << allocation_sizes.back() << " " << allocation_strides.back() - << " for ID: " << id->toString() << "\n"; } - std::cout << std::endl; // Only validate final sizes and strides when we have a non-empty tensor. if (tensor.numel() != 0) { From f5628962f0fe53696f2e651c17d6a28cf3c46060 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 07:17:54 -0800 Subject: [PATCH 18/35] Remove precomputed values from initialize executor entry, for some reason it's not synced correctly. 
--- csrc/runtime/executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index cf575b3c1c2..c85d05f6d0c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -655,7 +655,7 @@ void KernelExecutor::initializeExecutorEntry( ExpressionEvaluator expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); + // expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); From eb291c5414c19bc250af81de94fd692eb6881da3 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 09:31:40 -0800 Subject: [PATCH 19/35] Fix most failures. --- csrc/runtime/allocations.cpp | 104 ++++++++++++++++------------- csrc/runtime/allocations.h | 10 ++- csrc/runtime/executor.cpp | 44 +++++++++--- csrc/runtime/executor_kernel_arg.h | 4 ++ 4 files changed, 104 insertions(+), 58 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 5c56bd0d01e..5d64275599b 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -374,17 +374,18 @@ std::vector allocateOutputs( return out_tensors; } -std::vector allocateKernelOutputs( +KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, const KernelExecutorEntry& entry, const c10::Device& device, - const KernelArgumentHolder& args) { + const KernelArgumentHolder& args, + bool dynamic_alias) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); // TODO: Figure out if output to output aliasing is needed - std::vector out_tensors; - out_tensors.reserve(entry.outputs.size()); + KernelArgumentHolder out_tensors; + out_tensors.resize(entry.outputs.size()); for (auto out_idx : c10::irange(entry.outputs.size())) { auto out_info = entry.outputs.at(out_idx); if 
(entry.output_aliased_to_input.at(out_idx) == -1) { @@ -398,57 +399,38 @@ std::vector allocateKernelOutputs( if (shouldFillAllocationWithNan()) { fillTensorWithNan(alloc_tensor); } - out_tensors.emplace_back(alloc_tensor); - } else { - NVF_ERROR( - entry.output_aliased_to_input.at(out_idx) <= (int64_t)args.size(), - "Tried to grab an out of range input argument."); - auto input_arg = args[entry.output_aliased_to_input.at(out_idx)]; + out_tensors[out_idx] = alloc_tensor; + } else if ( + fusion->getOutputAlias(out_info.tv).type == + AllocationType::ReuseBuffer) { + auto inp = args[entry.output_aliased_to_input.at(out_idx)]; + NVF_ERROR(inp.is(), "Input is not a Tensor"); + out_tensors[out_idx] = inp; + } else if ( + fusion->getOutputAlias(out_info.tv).type == AllocationType::Evaluate) { + if (dynamic_alias) { + out_tensors[out_idx] = std::monostate(); + continue; + } + ExpressionEvaluator ee; ee.bind( - fusion->inputs()[entry.output_aliased_to_input.at(out_idx)], - input_arg); - auto output = ee.evaluate(out_info.tv).as(); - NVF_ERROR( - input_arg.is(), - "Aliased input argument is not a tensor."); - if (input_arg.as().sizes() != - out_info.shape_info.logical_sizes || - input_arg.as().strides() != - out_info.shape_info.logical_strides) { - out_tensors.emplace_back(output.as_strided( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides)); - } else { - out_tensors.emplace_back(output); - } + fusion->getOutputAlias(out_info.tv).aliased_io, + args[entry.output_aliased_to_input.at(out_idx)]); + out_tensors[out_idx] = ee.evaluate(out_info.tv); + } else { + NVF_THROW( + "Unexpected allocation path, internal logic around allocations must be incorrect."); } } return out_tensors; } -namespace { -GlobalBufferInfo getBufferInfo( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - TensorView* tv) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfo"); - GlobalBufferInfo info; - info.tv = tv; - info.shape_info = inferTensorShapes(info.tv, 
expr_eval); - auto dtype = - (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); - info.type = data_type_to_aten(dtype); - return info; -} - -} // namespace - std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, const std::vector& fusion_outputs) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfo"); + FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfos"); std::vector output_buffer_infos; output_buffer_infos.reserve(fusion_outputs.size()); for (const auto out : fusion_outputs) { @@ -456,8 +438,14 @@ std::vector getBufferInfos( out->isA(), "Cannot allocate outputs that are not tensors."); - output_buffer_infos.emplace_back( - getBufferInfo(expr_eval, index_dtype, out->as())); + GlobalBufferInfo info; + info.tv = out->as(); + info.shape_info = inferTensorShapes(info.tv, expr_eval); + auto dtype = + (info.tv->dtype() == DataType::Index ? index_dtype : info.tv->dtype()); + info.type = data_type_to_aten(dtype); + + output_buffer_infos.emplace_back(info); } return output_buffer_infos; } @@ -889,12 +877,30 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { + std::cout << "Infer shape: " << tv->toString() << std::endl; // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); - if (alias_info.type != AllocationType::New) { + if (alias_info.type == AllocationType::Evaluate) { + std::cout << "Evaluating" << std::endl; + std::cout << "Output: " << tv->toString() << std::endl; + auto inps = InputsOf::output(tv); + std::cout << "Inputs: " << std::endl; + for (auto inp : inps) { + std::cout << " " << inp->toString() << std::endl; + } + auto exprs = + DependencyCheck::getAllExprsBetween({inps.begin(), inps.end()}, {tv}); + std::cout << "Expressions: " << std::endl; + for (auto expr : exprs) { + std::cout << " " << expr->toString() << std::endl; + } + auto val = expr_eval.evaluate(tv); + NVF_ERROR(val.hasValue() && 
val.is(), "Output is not a Tensor"); auto tensor = val.as(); + if (!tv->hasAllocation()) { + std::cout << "Inferred 0" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -903,8 +909,10 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec()}; } + std::cout << "Allocation" << std::endl; auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); + std::cout << "Inferred 1" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -917,6 +925,7 @@ TensorShapeInfo inferTensorShapes( // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { + std::cout << "Inferred 2" << std::endl; return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, @@ -935,6 +944,7 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); + std::cout << "Inferred 3" << std::endl; return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index b2984d48629..29adbd5e254 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -90,11 +90,17 @@ std::vector allocateOutputs( const c10::Device& device, ExpressionEvaluator& ee); -std::vector allocateKernelOutputs( +// Allocate output tensors for a given fusion. Outputs may alias inputs, in +// that case output tensors are shallow copies of the aliased inputs. +// +// If dynamic_alias is true, then any argument with AllocationType::Evaluate +// will not be populated, it will be filled with std::monostate. 
+KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, const KernelExecutorEntry& entry, const c10::Device& device, - const KernelArgumentHolder& args); + const KernelArgumentHolder& args, + bool dynamic_alias = false); //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index c85d05f6d0c..b806e4f761e 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -296,7 +296,7 @@ void KernelExecutor::compile( if (output->isA()) { auto out_tv = output->as(); auto alias_info = fusion->getOutputAlias(out_tv); - if (alias_info.type == AllocationType::New) { + if (alias_info.type != AllocationType::Evaluate) { continue; } auto aliased_to = alias_info.aliased_io->as(); @@ -768,9 +768,6 @@ void KernelExecutor::initializeExecutorEntry( auto out_info = output_info[output_idx]; auto fusion = compiled_kernel_->kernel()->as(); auto alias_info = fusion->getOutputAlias(out_info.tv); - TORCH_WARN_ONCE( - alias_info.type != AllocationType::Evaluate, - "Outputs should not be evaluate type for kernels, this will be ignored and a kernel will produce the output tensor."); if (alias_info.type == AllocationType::New) { continue; } @@ -795,8 +792,8 @@ void KernelExecutor::initializeExecutorEntry( fusion->inputs(), "\nFusion Outputs:\n ", fusion->outputs()); - TORCH_WARN( - "Kernel found with output to output aliasing, this is unsupported in a kernel and will beignored.\n", + NVF_THROW( + "Kernel found with output to output aliasing, this is unsupported at this moment.\n", "Output: ", out_info.tv->toString(), "\nAliased to: ", @@ -1249,6 +1246,7 @@ std::vector KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { + std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1256,6 +1254,7 @@ std::vector KernelExecutor::run( 
compile_params, outputs, compiled_kernel_->kernel()->indexType()); + std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1274,11 +1273,33 @@ std::vector KernelExecutor::run( // only allocate outputs when not given if (outputs.empty()) { - outputs = allocateKernelOutputs( + auto outputs_args = allocateKernelOutputs( compiled_kernel_->kernel(), *executor_entry, compiled_kernel_->device(), - args); + args, + has_dynamic_alias_); + outputs.reserve(outputs_args.size()); + if (has_dynamic_alias_) { + for (const auto i : + c10::irange(compiled_kernel_->kernel()->outputs().size())) { + auto param = compiled_kernel_->kernel()->outputs()[i]; + if (!param->isA()) { + continue; + } + if (compiled_kernel_->kernel() + ->getOutputAlias(param->as()) + .type == AllocationType::Evaluate) { + outputs_args[i] = expr_eval.evaluate(param); + } + } + } + for (const auto i : c10::irange(outputs_args.size())) { + NVF_ERROR( + outputs_args[i].hasValue() && outputs_args[i].is(), + "Output is not populated or not a Tensor"); + outputs.emplace_back(outputs_args[i].as()); + } } args.push(outputs); @@ -1372,17 +1393,22 @@ std::vector KernelExecutor::run( if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments + std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); + std::cout << "TMA resolved" << std::endl; } if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values + std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); + std::cout << "RNG seed resolved" << std::endl; } } - + std::cout << "Computing args" << std::endl; computeArgs2(*executor_entry, args); + std::cout << "Args computed" << std::endl; if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff 
--git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index 40d44866165..6e3852549e6 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -78,6 +78,10 @@ class NVF_API KernelArgumentHolder { arguments_.reserve(size); } + void resize(size_t size) { + arguments_.resize(size); + } + void push(const std::vector& tensors); void push(const c10::ArrayRef& args); void push(std::initializer_list args) { From e4d94c6dbe8f59c0c9b1227c50c7a6225a8a9090 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 09:38:19 -0800 Subject: [PATCH 20/35] Cleanup. --- csrc/runtime/allocations.cpp | 20 -------------------- csrc/runtime/executor.cpp | 16 ++++++++-------- csrc/utils.cpp | 8 +++----- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 5d64275599b..39ae7a39c54 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -877,30 +877,14 @@ std::vector getInputBufferInfos( TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { - std::cout << "Infer shape: " << tv->toString() << std::endl; // Alias handling: auto alias_info = tv->fusion()->getOutputAlias(tv); if (alias_info.type == AllocationType::Evaluate) { - std::cout << "Evaluating" << std::endl; - std::cout << "Output: " << tv->toString() << std::endl; - auto inps = InputsOf::output(tv); - std::cout << "Inputs: " << std::endl; - for (auto inp : inps) { - std::cout << " " << inp->toString() << std::endl; - } - auto exprs = - DependencyCheck::getAllExprsBetween({inps.begin(), inps.end()}, {tv}); - std::cout << "Expressions: " << std::endl; - for (auto expr : exprs) { - std::cout << " " << expr->toString() << std::endl; - } - auto val = expr_eval.evaluate(tv); NVF_ERROR(val.hasValue() && val.is(), "Output is not a Tensor"); auto tensor = val.as(); if (!tv->hasAllocation()) { - std::cout << "Inferred 0" << std::endl; 
return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -909,10 +893,8 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec()}; } - std::cout << "Allocation" << std::endl; auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); - std::cout << "Inferred 1" << std::endl; return TensorShapeInfo{ tensor.sizes().vec(), tensor.strides().vec(), @@ -925,7 +907,6 @@ TensorShapeInfo inferTensorShapes( // Non-alias handling: auto allocation_size_stride = inferAllocationShape(tv, expr_eval); if (!tv->hasAllocation()) { - std::cout << "Inferred 2" << std::endl; return TensorShapeInfo{ allocation_size_stride.first, allocation_size_stride.second, @@ -944,7 +925,6 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); - std::cout << "Inferred 3" << std::endl; return { logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index b806e4f761e..a80e90a1bb0 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -1246,7 +1246,7 @@ std::vector KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { - std::cout << "Initializing executor entry" << std::endl; + // std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1254,7 +1254,7 @@ std::vector KernelExecutor::run( compile_params, outputs, compiled_kernel_->kernel()->indexType()); - std::cout << "Executor entry initialized" << std::endl; + // std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1393,22 +1393,22 @@ std::vector KernelExecutor::run( if (has_TMA_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments - std::cout << "Resolving TMA" << std::endl; + 
// std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); - std::cout << "TMA resolved" << std::endl; + // std::cout << "TMA resolved" << std::endl; } if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values - std::cout << "Resolving RNG seed" << std::endl; + // std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); - std::cout << "RNG seed resolved" << std::endl; + // std::cout << "RNG seed resolved" << std::endl; } } - std::cout << "Computing args" << std::endl; + // std::cout << "Computing args" << std::endl; computeArgs2(*executor_entry, args); - std::cout << "Args computed" << std::endl; + // std::cout << "Args computed" << std::endl; if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 0358908a6e1..c24ae356cbb 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -43,15 +43,13 @@ std::string debug_str(const at::Tensor& tensor) { std::stringstream ss; ss << "Tensor:"; ss << " shape: " << tensor.sizes(); - - if (!tensor.is_contiguous()) { - ss << ", strides: " << tensor.strides(); - } - ss << ", dtype: " << tensor.dtype(); ss << ", device: " << tensor.device(); ss << ", pointer: " << reinterpret_cast(tensor.data_ptr()); + if (!tensor.is_contiguous()) { + ss << ", strides: " << tensor.strides(); + } return ss.str(); } From ae81a733a40bd9c356ca974b97d57933d0a3f7bb Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sat, 1 Mar 2025 15:11:14 -0800 Subject: [PATCH 21/35] Fix serialization. 
--- .gitignore | 2 + csrc/runtime/executor.cpp | 68 +++++++++++++++++++++++++------ csrc/runtime/executor.h | 3 +- csrc/serde/fusion_cache.fbs | 6 ++- tests/python/test_schedule_ops.py | 3 ++ tools/run_nvfuser_tests.py | 2 +- 6 files changed, 69 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 81bb26635d5..27b73f61ffa 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,5 @@ foo.bin # Mac OS internal file .DS_Store + +test_log* \ No newline at end of file diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index a80e90a1bb0..7cc816e4f31 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -771,6 +771,7 @@ void KernelExecutor::initializeExecutorEntry( if (alias_info.type == AllocationType::New) { continue; } + NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); auto aliased_to = alias_info.aliased_io->as(); auto aliased_to_idx = std::find( @@ -779,10 +780,12 @@ void KernelExecutor::initializeExecutorEntry( if (aliased_to_idx < (int64_t)fusion->inputs().size()) { output_aliased_to_input[output_idx] = aliased_to_idx; } else { + auto aliased_out = std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = - std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to) - - fusion->outputs().begin(); + aliased_out - fusion->outputs().begin(); + NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found but is not an output or input of the fusion. 
", @@ -1014,6 +1017,9 @@ void KernelExecutor::computeArgs2( entry.args[arg_idx] = bytes; entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { + if(args[arg_idx].is()){ + buffer_info_idx++; + } auto bytes = polymorphicValueToBytes( args[arg_idx], compiled_kernel_->kernel()->parameters()[arg_idx]->dtype(), @@ -1613,8 +1619,9 @@ flatbuffers::Offset KernelExecutor::serialize( ? -1 : std::distance( compiledKernel()->kernel()->outputs().cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Output TensorView not found in kernel outputs"); outputs_fb.push_back( - serialize(builder, buffer, tv_position, true /* is_fusion_output */)); + serialize(builder, buffer, tv_position, true /* is_fusion_output */, false /* is_fusion_input */)); } // Serialize GlobalBufferInfo for intermediates. @@ -1638,23 +1645,44 @@ flatbuffers::Offset KernelExecutor::serialize( : std::distance( compiledKernel()->kernel()->summary().global_allocations.cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Intermediate TensorView not found in kernel global allocations"); intermediates_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */)); + serialize(builder, buffer, tv_position, false /* is_fusion_output */, false /* is_fusion_input */)); } + + std::vector inputs_fb; + inputs_fb.reserve(data.inputs.size()); + for (const auto& buffer : data.inputs) { + auto tv_iter = std::find( + compiledKernel()->kernel()->inputs().cbegin(), + compiledKernel()->kernel()->inputs().cend(), + buffer.tv); + auto tv_position = (tv_iter == compiledKernel()->kernel()->inputs().cend()) + ? 
-1 + : std::distance( + compiledKernel()->kernel()->inputs().cbegin(), tv_iter); + NVF_ERROR(tv_position != -1, "Input TensorView not found in kernel inputs"); + inputs_fb.push_back( + serialize(builder, buffer, tv_position, false /* is_fusion_output */, true /* is_fusion_input */)); + } return serde::CreateKernelExecutorEntryDirect( builder, data.init, data.launch_params.serialize(builder), &outputs_fb, - &intermediates_fb); + &intermediates_fb, + &inputs_fb, + &data.output_aliased_to_input, + &data.output_aliased_to_output); } flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const { + bool is_fusion_output, + bool is_fusion_input) const { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs return serde::CreateGlobalBufferInfoDirect( builder, @@ -1668,7 +1696,8 @@ flatbuffers::Offset KernelExecutor::serialize( data.zero_init, data.resets_to_zero, data.is_profile_buffer, - is_fusion_output); + is_fusion_output, + is_fusion_input); } void KernelExecutor::deserialize( @@ -1682,7 +1711,6 @@ void KernelExecutor::deserialize( int64_t runtime_id, int64_t group_id) { // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); NVF_ERROR(_fusion != nullptr, "Fusion is nullptr."); @@ -1755,6 +1783,18 @@ KernelExecutorEntry KernelExecutor::deserialize( entry.intermediates.push_back(deserialize(intermediate_buffer)); } + for (auto input_buffer : *buffer->inputs()) { + entry.inputs.push_back(deserialize(input_buffer)); + } + + for (auto output_aliased_to_input : *buffer->output_aliased_to_input()) { + entry.output_aliased_to_input.push_back(output_aliased_to_input); + } + + for (auto output_aliased_to_output : *buffer->output_aliased_to_output()) { + entry.output_aliased_to_output.push_back(output_aliased_to_output); + } + return entry; } @@ -1765,7 +1805,7 @@ 
GlobalBufferInfo KernelExecutor::deserialize( NVF_ERROR(buffer != nullptr, "serde::GlobalBufferInfo is nullptr."); NVF_ERROR( - buffer->tv() != -1, "Serialization failed to encode buffer tv position."); + buffer->tv_pos() != -1, "Serialization failed to encode buffer tv position."); NVF_ERROR( compiled_kernel_->lowered() != nullptr, @@ -1773,12 +1813,16 @@ GlobalBufferInfo KernelExecutor::deserialize( GlobalBufferInfo info; if (buffer->is_fusion_output()) { - auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv()); + auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val); + } else if(buffer->is_fusion_input()) { + auto in_val = compiled_kernel_->kernel()->inputs().at(buffer->tv_pos()); + NVF_ERROR(in_val != nullptr); + info.tv = dynamic_cast(in_val); } else { auto out_val = compiled_kernel_->kernel()->summary().global_allocations.at( - buffer->tv()); + buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val->buffer()); } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 2fa54473a12..522d2b49f1f 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -276,7 +276,8 @@ class KernelExecutor : public ExecutorAbstract { flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, - bool is_fusion_output) const; + bool is_fusion_output, + bool is_fusion_input) const; //! Deserialize GlobalBufferInfo using flatbuffers GlobalBufferInfo deserialize(const serde::GlobalBufferInfo* buffer); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 3ff755b2c15..079c6e88152 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -214,7 +214,7 @@ table LaunchParams { // For output tensors, we use its position in the fusion outputs. // For intermediate tensors, we use its position in the KernelSummary global_allocations. 
table GlobalBufferInfo { - tv : long = -1; + tv_pos : long = -1; logical_sizes : [long]; logical_strides : [long]; unsharded_logical_sizes : [long]; @@ -225,6 +225,7 @@ table GlobalBufferInfo { resets_to_zero : bool; is_profile_buffer : bool; is_fusion_output : bool; + is_fusion_input : bool; } // This table describes the cached KernelExecutorEntry for a kernel. @@ -233,6 +234,9 @@ table KernelExecutorEntry { launch_params : LaunchParams; outputs : [GlobalBufferInfo]; intermediates : [GlobalBufferInfo]; + inputs : [GlobalBufferInfo]; + output_aliased_to_input : [int]; + output_aliased_to_output : [int]; } // ===================================================================================== diff --git a/tests/python/test_schedule_ops.py b/tests/python/test_schedule_ops.py index 640753a2a5d..3c59ff37621 100644 --- a/tests/python/test_schedule_ops.py +++ b/tests/python/test_schedule_ops.py @@ -1012,6 +1012,9 @@ def schedule(self): self.assertEqual(nvf_out[1], inputs[4]) self.assertEqual(nvf_out[2], torch_ref) + @pytest.mark.skip( + reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." + ) def test_matmul_auto_scheduler(self): """ Implement a simple matmul kernel with a user defined schedule diff --git a/tools/run_nvfuser_tests.py b/tools/run_nvfuser_tests.py index 31c1067f37d..5292273f277 100644 --- a/tools/run_nvfuser_tests.py +++ b/tools/run_nvfuser_tests.py @@ -64,7 +64,7 @@ def get_python_tests(python_test_dir): def get_test_timeout(test_name): """Return timeout in seconds for a given test""" - if test_name in ["test_nvfuser", "test_matmul", "test_ops"]: + if test_name in ["test_nvfuser", "test_matmul", "test_ops.py"]: return 3600 # 1 hour return 600 # 10 minutes From 378d2511c0dce903d15a0babd3b3a20e2e75375e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 05:25:39 -0800 Subject: [PATCH 22/35] Fix serialization. 
--- .gitignore | 2 +- csrc/runtime/executor.cpp | 59 +++++++++++++++++++++---------- csrc/serde/fusion_cache.fbs | 3 ++ tests/python/test_schedule_ops.py | 2 +- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 27b73f61ffa..89d7c587c4b 100644 --- a/.gitignore +++ b/.gitignore @@ -52,4 +52,4 @@ foo.bin # Mac OS internal file .DS_Store -test_log* \ No newline at end of file +test_log* diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 7cc816e4f31..6e319679dea 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -781,11 +781,11 @@ void KernelExecutor::initializeExecutorEntry( output_aliased_to_input[output_idx] = aliased_to_idx; } else { auto aliased_out = std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = aliased_out - fusion->outputs().begin(); - + NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), "Alias found but is not an output or input of the fusion. ", @@ -1017,7 +1017,7 @@ void KernelExecutor::computeArgs2( entry.args[arg_idx] = bytes; entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { - if(args[arg_idx].is()){ + if (args[arg_idx].is()) { buffer_info_idx++; } auto bytes = polymorphicValueToBytes( @@ -1287,6 +1287,7 @@ std::vector KernelExecutor::run( has_dynamic_alias_); outputs.reserve(outputs_args.size()); if (has_dynamic_alias_) { + // TODO: Make sure dynamic alias works. 
for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; @@ -1402,8 +1403,7 @@ std::vector KernelExecutor::run( // std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); // std::cout << "TMA resolved" << std::endl; - } - if (has_rng_) { + } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values @@ -1550,7 +1550,10 @@ flatbuffers::Offset KernelExecutor::serialize( &executor_entry_lookup_keys_fb, &executor_entry_lookup_values_fb, toUnderlying(compiledKernel()->kernel()->indexType()), - serialize(builder, compiledKernel()->cudaExecutable().get())); + serialize(builder, compiledKernel()->cudaExecutable().get()), + has_rng_, + has_TMA_, + has_dynamic_alias_); } flatbuffers::Offset KernelExecutor::serialize( @@ -1619,9 +1622,14 @@ flatbuffers::Offset KernelExecutor::serialize( ? -1 : std::distance( compiledKernel()->kernel()->outputs().cbegin(), tv_iter); - NVF_ERROR(tv_position != -1, "Output TensorView not found in kernel outputs"); - outputs_fb.push_back( - serialize(builder, buffer, tv_position, true /* is_fusion_output */, false /* is_fusion_input */)); + NVF_ERROR( + tv_position != -1, "Output TensorView not found in kernel outputs"); + outputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + true /* is_fusion_output */, + false /* is_fusion_input */)); } // Serialize GlobalBufferInfo for intermediates. 
@@ -1645,12 +1653,17 @@ flatbuffers::Offset KernelExecutor::serialize( : std::distance( compiledKernel()->kernel()->summary().global_allocations.cbegin(), tv_iter); - NVF_ERROR(tv_position != -1, "Intermediate TensorView not found in kernel global allocations"); - intermediates_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */, false /* is_fusion_input */)); + NVF_ERROR( + tv_position != -1, + "Intermediate TensorView not found in kernel global allocations"); + intermediates_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + false /* is_fusion_input */)); } - std::vector inputs_fb; inputs_fb.reserve(data.inputs.size()); for (const auto& buffer : data.inputs) { @@ -1660,11 +1673,14 @@ flatbuffers::Offset KernelExecutor::serialize( buffer.tv); auto tv_position = (tv_iter == compiledKernel()->kernel()->inputs().cend()) ? -1 - : std::distance( - compiledKernel()->kernel()->inputs().cbegin(), tv_iter); + : std::distance(compiledKernel()->kernel()->inputs().cbegin(), tv_iter); NVF_ERROR(tv_position != -1, "Input TensorView not found in kernel inputs"); - inputs_fb.push_back( - serialize(builder, buffer, tv_position, false /* is_fusion_output */, true /* is_fusion_input */)); + inputs_fb.push_back(serialize( + builder, + buffer, + tv_position, + false /* is_fusion_output */, + true /* is_fusion_input */)); } return serde::CreateKernelExecutorEntryDirect( builder, @@ -1761,6 +1777,10 @@ void KernelExecutor::deserialize( buffer->executor_entry_lookup_keys()->Get(idx), deserialize(buffer->executor_entry_lookup_values()->Get(idx))); } + + has_rng_ = buffer->has_rng(); + has_TMA_ = buffer->has_TMA(); + has_dynamic_alias_ = buffer->has_dynamic_alias(); } KernelExecutorEntry KernelExecutor::deserialize( @@ -1805,7 +1825,8 @@ GlobalBufferInfo KernelExecutor::deserialize( NVF_ERROR(buffer != nullptr, "serde::GlobalBufferInfo is nullptr."); NVF_ERROR( - buffer->tv_pos() != -1, "Serialization failed to 
encode buffer tv position."); + buffer->tv_pos() != -1, + "Serialization failed to encode buffer tv position."); NVF_ERROR( compiled_kernel_->lowered() != nullptr, @@ -1816,7 +1837,7 @@ GlobalBufferInfo KernelExecutor::deserialize( auto out_val = compiled_kernel_->kernel()->outputs().at(buffer->tv_pos()); NVF_ERROR(out_val != nullptr); info.tv = dynamic_cast(out_val); - } else if(buffer->is_fusion_input()) { + } else if (buffer->is_fusion_input()) { auto in_val = compiled_kernel_->kernel()->inputs().at(buffer->tv_pos()); NVF_ERROR(in_val != nullptr); info.tv = dynamic_cast(in_val); diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 079c6e88152..c39b76bbad2 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -379,6 +379,9 @@ table KernelExecutor { // Is this kernel being compiled with int32 or int64 indexing? index_type : long; compiled_kernel: CudaKernel; + has_rng: bool; + has_TMA: bool; + has_dynamic_alias: bool; } // A directed edge on DAG, which wraps a value that connects segmented groups. diff --git a/tests/python/test_schedule_ops.py b/tests/python/test_schedule_ops.py index 3c59ff37621..a271a8531f3 100644 --- a/tests/python/test_schedule_ops.py +++ b/tests/python/test_schedule_ops.py @@ -1013,7 +1013,7 @@ def schedule(self): self.assertEqual(nvf_out[2], torch_ref) @pytest.mark.skip( - reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." + reason="Disable test, the scheduler is not actually sending to ExprEvalExec but is sending to KernelExecutor which will correctly error." ) def test_matmul_auto_scheduler(self): """ From 9ba868ebec9ae7d6415c475aecd6412b2f2063d7 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 16:42:35 -0800 Subject: [PATCH 23/35] Clang. 
--- csrc/runtime/allocations.cpp | 12 +++++++----- csrc/runtime/executor.cpp | 16 ++++++++-------- csrc/runtime/executor_kernel_arg.h | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 06817136eac..a459cbb09ed 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -403,7 +403,7 @@ KernelArgumentHolder allocateKernelOutputs( } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::ReuseBuffer) { - auto inp = args[entry.output_aliased_to_input.at(out_idx)]; + const auto& inp = args[entry.output_aliased_to_input.at(out_idx)]; NVF_ERROR(inp.is(), "Input is not a Tensor"); out_tensors[out_idx] = inp; } else if ( @@ -845,7 +845,7 @@ std::vector getInputBufferInfos( auto allocation_domain = TensorDomain::noReductions( buffer_info.tv->getMaybeAllocationDomain()); std::unordered_map logical_to_allocation_map; - for (int64_t logical_idx : c10::irange(logical_domain.size())) { + for (auto logical_idx : c10::irange(logical_domain.size())) { auto allocation_id = std::find( allocation_domain.begin(), allocation_domain.end(), @@ -853,14 +853,16 @@ std::vector getInputBufferInfos( NVF_ERROR( allocation_id != allocation_domain.end(), "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); - logical_to_allocation_map[logical_idx] = + logical_to_allocation_map[(int64_t)logical_idx] = std::distance(allocation_domain.begin(), allocation_id); } std::vector allocation_sizes(allocation_domain.size()); std::vector allocation_strides(allocation_domain.size()); for (auto i : c10::irange(allocation_domain.size())) { - allocation_sizes[i] = logical_sizes[logical_to_allocation_map[i]]; - allocation_strides[i] = logical_strides[logical_to_allocation_map[i]]; + allocation_sizes[i] = + (int64_t)logical_sizes[logical_to_allocation_map[i]]; + allocation_strides[i] = + (int64_t)logical_strides[logical_to_allocation_map[i]]; } 
buffer_info.shape_info.allocation_sizes = allocation_sizes; buffer_info.shape_info.allocation_strides = allocation_strides; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index facd575f6ed..9a04931f146 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -718,13 +718,13 @@ void KernelExecutor::initializeExecutorEntry( } shape_info.allocation_sizes = alloc_sizes; shape_info.allocation_strides = alloc_strides; - GlobalBufferInfo info( + GlobalBufferInfo info{ input_tv, shape_info, data_type_to_aten(input_tv->dtype()), false, false, - false); + false}; input_info.emplace_back(info); } } @@ -774,18 +774,18 @@ void KernelExecutor::initializeExecutorEntry( } NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); auto aliased_to = alias_info.aliased_io->as(); - auto aliased_to_idx = + auto aliased_to_idx = std::distance( + fusion->inputs().begin(), std::find( - fusion->inputs().begin(), fusion->inputs().end(), aliased_to) - - fusion->inputs().begin(); + fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_aliased_to_input[output_idx] = aliased_to_idx; + output_aliased_to_input[(int64_t)output_idx] = aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); NVF_ERROR(aliased_out != fusion->outputs().end(), "Alias not found"); output_aliased_to_output[output_idx] = - aliased_out - fusion->outputs().begin(); + std::distance(fusion->outputs().begin(), aliased_out); NVF_ERROR( output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), @@ -1183,7 +1183,7 @@ KernelArgumentHolder KernelExecutor::resolveTMA( compiled_kernel_->kernel()->outputs()[out_idx], args[arg_idx++]); } - for (auto intermediate_entry : entry.intermediates) { + for (const auto& intermediate_entry : entry.intermediates) { expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); } diff --git 
a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index 6e3852549e6..a8a7fb110de 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -22,7 +22,7 @@ namespace nvfuser { -class GlobalBufferInfo; +struct GlobalBufferInfo; //! KernelArgumentHolder copies meta information from kernel inputs, including //! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used From 32a87fb5820e3956d8d5213f749d91a7b1c76a52 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 16:57:15 -0800 Subject: [PATCH 24/35] Remove output to output aliasing, it's not used. --- csrc/runtime/allocations.cpp | 2 +- csrc/runtime/executor.cpp | 26 ++++++-------------------- csrc/runtime/executor.h | 1 - csrc/serde/fusion_cache.fbs | 1 - 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index a459cbb09ed..c3ef950d563 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -197,7 +197,7 @@ std::pair, std::vector> inferShapeOfIntermediate( return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); } -bool fill_allocation_with_nan_ = false; +static bool fill_allocation_with_nan_ = false; bool shouldFillAllocationWithNan() { return fill_allocation_with_nan_; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 9a04931f146..24b652ff323 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -763,7 +763,6 @@ void KernelExecutor::initializeExecutorEntry( } std::vector output_aliased_to_input(output_info.size(), -1); - std::vector output_aliased_to_output(output_info.size(), -1); for (auto output_idx : c10::irange(output_info.size())) { auto out_info = output_info[output_idx]; @@ -783,19 +782,12 @@ void KernelExecutor::initializeExecutorEntry( } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); - NVF_ERROR(aliased_out != 
fusion->outputs().end(), "Alias not found"); - output_aliased_to_output[output_idx] = - std::distance(fusion->outputs().begin(), aliased_out); - NVF_ERROR( - output_aliased_to_output[output_idx] < (int)fusion->outputs().size(), - "Alias found but is not an output or input of the fusion. ", - "Aliased to tv: ", - aliased_to->toString(), - "\nFusion Inputs:\n ", - fusion->inputs(), - "\nFusion Outputs:\n ", - fusion->outputs()); + aliased_out != fusion->outputs().end(), + "Could not find the alias tensor of: ", + out_info.tv->toString(), + "\nAliased to: ", + aliased_to->toString()); NVF_THROW( "Kernel found with output to output aliasing, this is unsupported at this moment.\n", "Output: ", @@ -811,7 +803,6 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; executor_entry.output_aliased_to_input = output_aliased_to_input; - executor_entry.output_aliased_to_output = output_aliased_to_output; executor_entry.intermediates = intermediates; executor_entry.inputs = input_info; executor_entry.init = true; @@ -1681,8 +1672,7 @@ flatbuffers::Offset KernelExecutor::serialize( &outputs_fb, &intermediates_fb, &inputs_fb, - &data.output_aliased_to_input, - &data.output_aliased_to_output); + &data.output_aliased_to_input); } flatbuffers::Offset KernelExecutor::serialize( @@ -1803,10 +1793,6 @@ KernelExecutorEntry KernelExecutor::deserialize( entry.output_aliased_to_input.push_back(output_aliased_to_input); } - for (auto output_aliased_to_output : *buffer->output_aliased_to_output()) { - entry.output_aliased_to_output.push_back(output_aliased_to_output); - } - return entry; } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 74547d2bf86..ffe4d1a37da 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -70,7 +70,6 @@ struct KernelExecutorEntry { // If an output is aliased to an input, this will hold the index of the // input that it is aliased to. 
If not aliased, it will hold -1. std::vector output_aliased_to_input; - std::vector output_aliased_to_output; // Temporary work buffers and intemediate global-memory tensors std::vector intermediates; std::vector inputs; diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index c39b76bbad2..d7ffdeba45d 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -236,7 +236,6 @@ table KernelExecutorEntry { intermediates : [GlobalBufferInfo]; inputs : [GlobalBufferInfo]; output_aliased_to_input : [int]; - output_aliased_to_output : [int]; } // ===================================================================================== From cec8ab3ed348ee56124b8aada467713fe6c0c70b Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:17:36 -0800 Subject: [PATCH 25/35] Update HostIR Exec and remove duplicate allocation code. --- csrc/host_ir/executor.cpp | 69 ++++++++++------ csrc/runtime/allocations.cpp | 141 +++----------------------------- csrc/runtime/allocations.h | 22 +---- csrc/runtime/executor.cpp | 62 +++++--------- csrc/runtime/executor.h | 2 +- csrc/runtime/executor_utils.cpp | 38 +++++++++ csrc/runtime/executor_utils.h | 5 ++ csrc/serde/fusion_cache.fbs | 2 +- 8 files changed, 123 insertions(+), 218 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index b726918d375..5465a02b09b 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -86,21 +86,6 @@ bool HostIrExecutor::isCompiled() const { return (bool)host_ir_container_; } -namespace { -// Host IR specific function, returns the at:Tensor (ordered list) associated -// with the provdied Fusion output tv -at::Tensor findBufferForFusionOutput( - const KernelArgumentHolder& output_args, - const Val* fusion_out, - const Fusion* fusion) { - auto i = - std::find(fusion->outputs().begin(), fusion->outputs().end(), fusion_out); - NVF_ERROR(i != fusion->outputs().end()); - auto index = 
std::distance(fusion->outputs().begin(), i); - return output_args[index].as(); -} -} // namespace - KernelArgumentHolder HostIrExecutor::run( KernelArgumentHolder& args, KernelArgumentHolder output_args) { @@ -120,13 +105,18 @@ KernelArgumentHolder HostIrExecutor::run( auto expr_eval = executor_utils::bindInputs(args, host_ir_container_.get()); if (output_args.empty()) { - std::vector output_info = getBufferInfos( + std::vector output_infos = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); - output_args = allocateOutputs( + output_args.resize(host_ir_container_->outputs().size()); + auto output_alias_to_input = + executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); + output_args = allocateKernelOutputs( host_ir_container_.get(), - output_info, + output_infos, + output_alias_to_input, c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()), - expr_eval); + args, + true); } // TODO: If outputs are provided validate they're the correct size @@ -136,8 +126,18 @@ KernelArgumentHolder HostIrExecutor::run( c10d::Backend* backend = communicator_->getBackendForTeam(communication->team(), std::nullopt); auto in_tensor = expr_eval.evaluate(communication->in()).as(); - at::Tensor out_tensor = findBufferForFusionOutput( - output_args, communication->out(), host_ir_container_.get()); + auto out_idx = std::distance( + host_ir_container_->outputs().begin(), + std::find( + host_ir_container_->outputs().begin(), + host_ir_container_->outputs().end(), + communication->out())); + + NVF_ERROR( + out_idx < (int64_t)host_ir_container_->outputs().size(), + "Output tensor not found in fusion outputs"); + auto out_tensor = output_args[out_idx].as(); + c10::intrusive_ptr work = postSingleCommunication( communication, communicator_->deviceId(), @@ -148,6 +148,19 @@ KernelArgumentHolder HostIrExecutor::run( work->wait(); } } + + // Evaluate outputs that are marked as Evaluate + for (auto out_idx : c10::irange(host_ir_container_->outputs().size())) 
{ + auto out = host_ir_container_->outputs()[out_idx]; + auto alias_info = host_ir_container_->getOutputAlias(out); + if (alias_info.type == AllocationType::Evaluate) { + NVF_ERROR( + !output_args[out_idx].hasValue(), + "Output tensor already has a value"); + output_args[out_idx] = expr_eval.evaluate(out); + } + } + if (isProfilerEnabled()) { FusionProfiler::segment(group_id_).setDevice(args.getDeviceIndex()); FusionProfiler::segment(group_id_).stopKernel(); @@ -572,13 +585,21 @@ void HostIrEvaluator::handle(kir::Allocate* allocate) { "Allocation must be on a TensorView but got ", allocate->buffer()); TensorView* tv = allocate->buffer()->as(); + if (expr_evaluator_.isKnown(tv)) { + return; + } GlobalBufferInfo info = getBufferInfos(expr_evaluator_, PrimDataType::Int, {tv}).at(0); - AliasInfo alias_info = { - .type = AllocationType::New, .aliased_io = nullptr, .hide_output = false}; c10::Device device = communicator_ ? communicator_->device() : at::Device("cuda:0"); - at::Tensor tensor = allocateTensor(info, alias_info, device, expr_evaluator_); + auto tensor = at::native::empty_strided_cuda( + info.shape_info.logical_sizes, + info.shape_info.logical_strides, + info.type, + c10::nullopt, + device, + c10::nullopt); + expr_evaluator_.bind(tv, tensor); } diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index c3ef950d563..770d8a73a52 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -254,141 +254,20 @@ void fillTensorWithNan(at::Tensor& t) { } } -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, - const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateTensor"); - // Handle a fusion with duplicated outputs. 
- TensorView* out_tv = out_info.tv; - if (ee.isKnown(out_tv)) { - return ee.evaluate(out_tv).as(); - } - - std::optional aliased_io_tensor = std::nullopt; - Val* aliased_io = alias_info.aliased_io; - if (aliased_io != nullptr) { - NVF_ERROR( - aliased_io->isFusionInput() || aliased_io->isFusionOutput(), - aliased_io->toInlineString(), - " is expected to be a fusion input/output. `ee.evaluate` ", - "an intermediate tensor may involve GPU computation to materialize it ", - "to global memory."); - const PolymorphicValue& aliased_io_val = ee.evaluate(aliased_io); - NVF_ERROR( - aliased_io_val.is(), - "Alias io only supports tensor. Found ", - PolymorphicValue_functions::toString(aliased_io_val)); - aliased_io_tensor = aliased_io_val.as(); - } - - switch (alias_info.type) { - case AllocationType::New: { - auto alloc_tensor = at::native::empty_strided_cuda( - out_info.shape_info.logical_sizes, - out_info.shape_info.logical_strides, - out_info.type, - c10::nullopt, - device, - c10::nullopt); - if (shouldFillAllocationWithNan()) { - fillTensorWithNan(alloc_tensor); - } - return alloc_tensor; - } - case AllocationType::ReuseBuffer: - // Unlike for `AllocationType::Evaluate`, don't use - // ExpressionEvaluator to compute the output tensor. This is because - // the output tensor may hold different data from the input, e.g., an - // updated running mean. `ExpressionEvaluator::evaluate(out_tv)` - // would trigger non-trivial host computation. 
- return aliased_io_tensor.value(); - case AllocationType::Evaluate: { - auto out_tensor = ee.evaluate(out_tv).as(); - if (aliased_io_tensor.has_value()) { - NVF_ERROR( - out_tensor.is_alias_of(aliased_io_tensor.value()), - "ExpressionEvaluator failed to evaluate ", - out_tv->toString(), - " as an alias of ", - aliased_io->toString()); - inferAndValidateAllocationSizesAndStrides(out_tensor, out_tv, ee); - } - return out_tensor; - } - default: - NVF_THROW("Unrecognized AllocationType."); - } -} - -KernelArgumentHolder allocateOutputs( - const Fusion* fusion, - const std::vector& output_info, - const c10::Device& device, - ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - - const auto num_outs = output_info.size(); - - // Sort the outputs so we compute aliases after allocating non-aliases. The - // order between aliases can be arbitrary. E.g., - // - // ``` - // non_alias_out = ... - // alias_out_0 = reshape(non_alias_out, ...) - // alias_out_1 = reshape(alias_out_0, ...) - // ``` - // - // It's fine to compute `alias_out_1` before computing `alias_out_0`: when we - // compute `alias_out_1`, `alias_out_0` will be recursively - // `ExpressionEvaluator::evaluate`ed. However, `non_alias_out` must be - // allocated first so `alias_out_*` can refer them. 
- std::vector> sorted_outs; - sorted_outs.reserve(num_outs); - for (const auto out_index : c10::irange(num_outs)) { - sorted_outs.emplace_back(out_index, fusion->outputs()[out_index]); - } - std::sort( - sorted_outs.begin(), - sorted_outs.end(), - [fusion]( - const std::pair& lhs, - const std::pair& rhs) { - return ( - fusion->getOutputAlias(lhs.second).type == AllocationType::New && - fusion->getOutputAlias(rhs.second).type != AllocationType::New); - }); - - std::vector out_tensors(num_outs); - for (const auto& [out_index, out] : sorted_outs) { - at::Tensor out_tensor = allocateTensor( - output_info[out_index], fusion->getOutputAlias(out), device, ee); - // Bind `out_tensor` so - // 1. duplicated outputs map to the same tensor, - // 2. an output that aliases another output can be evaluated via - // ExpressionEvaluator cheaply. - ee.bind(out, out_tensor); - out_tensors[out_index] = out_tensor; - } - return KernelArgumentHolder(out_tensors); -} - KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, - const KernelExecutorEntry& entry, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, const KernelArgumentHolder& args, - bool dynamic_alias) { + bool dynamic_evaluate) { FUSER_PERF_SCOPE("fusion_executor::allocations::allocateOutputs"); - // TODO: Figure out if output to output aliasing is needed - KernelArgumentHolder out_tensors; - out_tensors.resize(entry.outputs.size()); - for (auto out_idx : c10::irange(entry.outputs.size())) { - auto out_info = entry.outputs.at(out_idx); - if (entry.output_aliased_to_input.at(out_idx) == -1) { + out_tensors.resize(output_infos.size()); + for (auto out_idx : c10::irange(output_infos.size())) { + auto out_info = output_infos.at(out_idx); + if (output_alias_to_input_map.at(out_idx) == -1) { auto alloc_tensor = at::native::empty_strided_cuda( out_info.shape_info.logical_sizes, out_info.shape_info.logical_strides, @@ -403,12 +282,12 @@ KernelArgumentHolder 
allocateKernelOutputs( } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::ReuseBuffer) { - const auto& inp = args[entry.output_aliased_to_input.at(out_idx)]; + const auto& inp = args[output_alias_to_input_map.at(out_idx)]; NVF_ERROR(inp.is(), "Input is not a Tensor"); out_tensors[out_idx] = inp; } else if ( fusion->getOutputAlias(out_info.tv).type == AllocationType::Evaluate) { - if (dynamic_alias) { + if (dynamic_evaluate) { out_tensors[out_idx] = std::monostate(); continue; } @@ -416,7 +295,7 @@ KernelArgumentHolder allocateKernelOutputs( ExpressionEvaluator ee; ee.bind( fusion->getOutputAlias(out_info.tv).aliased_io, - args[entry.output_aliased_to_input.at(out_idx)]); + args[output_alias_to_input_map.at(out_idx)]); out_tensors[out_idx] = ee.evaluate(out_info.tv); } else { NVF_THROW( diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 72fea232ab1..71a3b46ddbb 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -75,32 +75,18 @@ TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval); -// Allocate an `at::Tensor` for `out_info` or compute it as an alias. -at::Tensor allocateTensor( - const GlobalBufferInfo& out_info, - const AliasInfo& alias_info, - const c10::Device& device, - ExpressionEvaluator& ee); - -// Allocate output tensors for a given fusion. Outputs may alias inputs, in -// that case output tensors are shallow copies of the aliased inputs -KernelArgumentHolder allocateOutputs( - const Fusion* fusion, - const std::vector& output_info, - const c10::Device& device, - ExpressionEvaluator& ee); - // Allocate output tensors for a given fusion. Outputs may alias inputs, in // that case output tensors are shallow copies of the aliased inputs. 
// -// If dynamic_alias is true, then any argument with AllocationType::Evaluate +// If dynamic_evaluate is true, then any argument with AllocationType::Evaluate // will not be populated, it will be filled with std::monostate. KernelArgumentHolder allocateKernelOutputs( const Fusion* fusion, - const KernelExecutorEntry& entry, + const std::vector& output_infos, + const std::vector& output_alias_to_input_map, const c10::Device& device, const KernelArgumentHolder& args, - bool dynamic_alias = false); + bool dynamic_evaluate = false); //! Return information necessary for allocating output tensors. Input //! and output tensors are allowed to alias each other, which is diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 24b652ff323..8340db7b33c 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -285,13 +285,22 @@ void KernelExecutor::compile( for (auto expr : exprs) { if (ir_utils::isCpAsyncBulk(expr)) { - has_TMA_ = true; + has_tma_ = true; } if (expr->isA()) { has_rng_ = true; } } + // If an output has an alias to an input and is marked Evaluate, then + // expression evaluator evaluate is called on that output to produce the meta + // data manipulation it requires. If that manipulation is something like a + // slice, and that slice has a symbolic integer it depends on, then this + // function returns true. + // + // This could happen for other examples and this function will return true if + // to evaluate the output that has an alias, other values besides the aliased + // input need to be bound to the expression evaluator to evaluate the output. 
for (auto output : fusion->outputs()) { if (output->isA()) { auto out_tv = output->as(); @@ -762,47 +771,13 @@ void KernelExecutor::initializeExecutorEntry( } } - std::vector output_aliased_to_input(output_info.size(), -1); - - for (auto output_idx : c10::irange(output_info.size())) { - auto out_info = output_info[output_idx]; - auto fusion = compiled_kernel_->kernel()->as(); - auto alias_info = fusion->getOutputAlias(out_info.tv); - if (alias_info.type == AllocationType::New) { - continue; - } - NVF_ERROR(alias_info.aliased_io, "Alias info is not an input or output"); - auto aliased_to = alias_info.aliased_io->as(); - auto aliased_to_idx = std::distance( - fusion->inputs().begin(), - std::find( - fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); - if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_aliased_to_input[(int64_t)output_idx] = aliased_to_idx; - } else { - auto aliased_out = std::find( - fusion->outputs().begin(), fusion->outputs().end(), aliased_to); - NVF_ERROR( - aliased_out != fusion->outputs().end(), - "Could not find the alias tensor of: ", - out_info.tv->toString(), - "\nAliased to: ", - aliased_to->toString()); - NVF_THROW( - "Kernel found with output to output aliasing, this is unsupported at this moment.\n", - "Output: ", - out_info.tv->toString(), - "\nAliased to: ", - aliased_to->toString()); - } - } - auto intermediates = getIntermediateBufferInfo(expr_eval, index_type); // All information is gathered. 
Save it to KernelExecutorEntry executor_entry.launch_params = launch_params; executor_entry.outputs = output_info; - executor_entry.output_aliased_to_input = output_aliased_to_input; + executor_entry.output_aliased_to_input = + executor_utils::getOutputAliasToInputMap(compiled_kernel_->kernel()); executor_entry.intermediates = intermediates; executor_entry.inputs = input_info; executor_entry.init = true; @@ -1205,7 +1180,7 @@ KernelArgumentHolder KernelExecutor::run( } ExpressionEvaluator expr_eval; - if (has_dynamic_alias_ || has_TMA_) { + if (has_dynamic_alias_ || has_tma_) { expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); } @@ -1273,7 +1248,8 @@ KernelArgumentHolder KernelExecutor::run( if (output_args.empty()) { output_args = allocateKernelOutputs( compiled_kernel_->kernel(), - *executor_entry, + executor_entry->outputs, + executor_entry->output_aliased_to_input, compiled_kernel_->device(), args, has_dynamic_alias_); @@ -1375,11 +1351,11 @@ KernelArgumentHolder KernelExecutor::run( if (args.size() != compiled_kernel_->kernel()->parameters().size()) { NVF_ERROR( - has_TMA_ || has_rng_, + has_tma_ || has_rng_, "No TMA or RNG found in the kernel, but detected an argument size mismatch."); // If args don't match one of two things is happening. We need to add TMA // related args or RNG related args. Resolve these scenarios. 
- if (has_TMA_) { + if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments // std::cout << "Resolving TMA" << std::endl; @@ -1535,7 +1511,7 @@ flatbuffers::Offset KernelExecutor::serialize( toUnderlying(compiledKernel()->kernel()->indexType()), serialize(builder, compiledKernel()->cudaExecutable().get()), has_rng_, - has_TMA_, + has_tma_, has_dynamic_alias_); } @@ -1761,7 +1737,7 @@ void KernelExecutor::deserialize( } has_rng_ = buffer->has_rng(); - has_TMA_ = buffer->has_TMA(); + has_tma_ = buffer->has_tma(); has_dynamic_alias_ = buffer->has_dynamic_alias(); } diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index ffe4d1a37da..bd352dd4fe6 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -319,7 +319,7 @@ class KernelExecutor : public ExecutorAbstract { // Has a TMA kernel and therefore needs to infer TMA inputs through expression // evaluator - bool has_TMA_ = false; + bool has_tma_ = false; // Has a dynamic alias and therefore needs to infer what they are through // expression evaluator diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index 6e0c2d769a5..e81f2895a31 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -599,6 +599,44 @@ ExpressionEvaluator bindInputs( return expr_eval; } +std::vector getOutputAliasToInputMap(const Fusion* fusion) { + std::vector output_to_input_map(fusion->outputs().size(), -1); + for (auto output_idx : c10::irange(fusion->outputs().size())) { + auto alias_info = fusion->getOutputAlias(fusion->outputs()[output_idx]); + if (alias_info.type == AllocationType::New) { + continue; + } + NVF_ERROR( + alias_info.aliased_io && alias_info.aliased_io->isA(), + "Alias information is missing the aliased tensor."); + + auto aliased_to = alias_info.aliased_io->as(); + auto aliased_to_idx = std::distance( + fusion->inputs().begin(), + std::find( + fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); 
+ if (aliased_to_idx < (int64_t)fusion->inputs().size()) { + output_to_input_map[(int64_t)output_idx] = aliased_to_idx; + } else { + auto aliased_out = std::find( + fusion->outputs().begin(), fusion->outputs().end(), aliased_to); + NVF_ERROR( + aliased_out != fusion->outputs().end(), + "Could not find the alias tensor of: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + NVF_THROW( + "Kernel found with output to output aliasing, this is unsupported at this moment.\n", + "Output: ", + fusion->outputs()[output_idx]->toString(), + "\nAliased to: ", + aliased_to->toString()); + } + } + return output_to_input_map; +} + CudaExecutable::~CudaExecutable() { if (module != nullptr) { NVFUSER_CUDA_SAFE_CALL(cuModuleUnload(module)); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 9d631f2548e..211e612c31c 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -51,6 +51,11 @@ struct CudaExecutable : public NonCopyable { NVF_API ExpressionEvaluator bindInputs(const KernelArgumentHolder& args, Fusion* fusion); +// Returns a vector where vector[out_idx] == the input index in fusion->inputs() +// that output[out_idx] is aliased to. If output[out_idx] is not aliased to any +// input, then vector[out_idx] is -1. 
+std::vector getOutputAliasToInputMap(const Fusion* fusion); + // Compile time cache for execution namespace caching { // TODO: Could consider putting some of diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index d7ffdeba45d..a09c3497e73 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -379,7 +379,7 @@ table KernelExecutor { index_type : long; compiled_kernel: CudaKernel; has_rng: bool; - has_TMA: bool; + has_tma: bool; has_dynamic_alias: bool; } From e05cd7fb320c15b524760d864ef9d6b6ec0696d7 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:18:48 -0800 Subject: [PATCH 26/35] Revert test debug changes. --- tests/cpp/test_allocation_domain.cpp | 4 ---- tests/cpp/test_multidevice_sharding.cpp | 13 +++++-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index e78425045fd..f736edc1296 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1114,11 +1114,7 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { fusion->addOutput(tv1); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - // allocate [8,8] strided [8, 1] - // modify to [4,8] strided [1, 8] - // Tell nvFuser it's allocated as [4, 8] strided [8, 1] at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache executor_cache(std::move(fusion)); auto outputs = executor_cache.runFusionWithInputs({t0}); diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 7772ae9da9e..5c7a5e43c62 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -241,16 +241,13 @@ TEST_F(MultiDeviceTest, DivideBySum) { for (auto* tv : {x, y}) { tv->setAllocationDomain(tv->getLoopDomain(), true); } - x->setContiguity({false, false, true, true, true}); - const int64_t b = 7; - const 
int64_t h = d * 5; - const int64_t s = 3; + + const int64_t b = 2; + const int64_t h = d * 3; + const int64_t s = 5; at::Tensor unsharded_x_tensor = at::randint(5, {b, h, s, s}, tensor_options); + at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - // at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - at::Tensor x_tensor = at::randint(5, {b, h / d, s, s}, tensor_options); - std::cout << "Properties: " << debug_str(x_tensor) << "\n"; - // std::cout<<"input:"<(); From 7c2ac6507f6098c12ed0bf8174e6373c62d0c1df Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Sun, 2 Mar 2025 18:27:13 -0800 Subject: [PATCH 27/35] Rename and remove dead code. --- csrc/host_ir/executor.cpp | 2 +- csrc/runtime/allocations.cpp | 68 +----------------------------------- csrc/runtime/allocations.h | 8 +---- csrc/runtime/executor.cpp | 2 +- 4 files changed, 4 insertions(+), 76 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 5465a02b09b..32c2f5a956e 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -110,7 +110,7 @@ KernelArgumentHolder HostIrExecutor::run( output_args.resize(host_ir_container_->outputs().size()); auto output_alias_to_input = executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); - output_args = allocateKernelOutputs( + output_args = allocateOutputs( host_ir_container_.get(), output_infos, output_alias_to_input, diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 770d8a73a52..83ab60d900d 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -254,7 +254,7 @@ void fillTensorWithNan(at::Tensor& t) { } } -KernelArgumentHolder allocateKernelOutputs( +KernelArgumentHolder allocateOutputs( const Fusion* fusion, const std::vector& output_infos, const std::vector& output_alias_to_input_map, @@ -689,72 +689,6 @@ std::pair, std::vector> inferShapeOfOutput( return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } -std::vector 
getInputBufferInfos( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - const std::vector& fusion_inputs, - const std::vector& inputs) { - NVF_ERROR( - fusion_inputs.size() == inputs.size(), - "Mismatch in inputs provided, expected ", - fusion_inputs.size(), - " but got ", - inputs.size()); - std::vector buffer_infos; - for (auto i : c10::irange(fusion_inputs.size())) { - GlobalBufferInfo buffer_info; - buffer_info.tv = fusion_inputs[i]->as(); - auto logical_sizes = inputs[i].sizes().vec(); - auto logical_strides = inputs[i].strides().vec(); - TensorShapeInfo shape_info; - shape_info.logical_sizes = logical_sizes; - shape_info.logical_strides = logical_strides; - if (isSharded(buffer_info.tv)) { - shape_info.unsharded_logical_sizes = - unshardedSizes(buffer_info.tv, logical_sizes); - } - buffer_info.shape_info = shape_info; - buffer_info.type = inputs[i].scalar_type(); - - // TODO: Handle input allocation domains that aren't permutes - // of the logical domain - if (buffer_info.tv->hasAllocation()) { - auto logical_domain = - TensorDomain::noReductions(buffer_info.tv->getLogicalDomain()); - auto allocation_domain = TensorDomain::noReductions( - buffer_info.tv->getMaybeAllocationDomain()); - std::unordered_map logical_to_allocation_map; - for (auto logical_idx : c10::irange(logical_domain.size())) { - auto allocation_id = std::find( - allocation_domain.begin(), - allocation_domain.end(), - logical_domain[logical_idx]); - NVF_ERROR( - allocation_id != allocation_domain.end(), - "Logical domain and allocation domain have different sets of IterDomains, this is not supported yet."); - logical_to_allocation_map[(int64_t)logical_idx] = - std::distance(allocation_domain.begin(), allocation_id); - } - std::vector allocation_sizes(allocation_domain.size()); - std::vector allocation_strides(allocation_domain.size()); - for (auto i : c10::irange(allocation_domain.size())) { - allocation_sizes[i] = - (int64_t)logical_sizes[logical_to_allocation_map[i]]; - 
allocation_strides[i] = - (int64_t)logical_strides[logical_to_allocation_map[i]]; - } - buffer_info.shape_info.allocation_sizes = allocation_sizes; - buffer_info.shape_info.allocation_strides = allocation_strides; - } else { - buffer_info.shape_info.allocation_sizes = logical_sizes; - buffer_info.shape_info.allocation_strides = logical_strides; - } - - buffer_infos.emplace_back(buffer_info); - } - return buffer_infos; -} - TensorShapeInfo inferTensorShapes( TensorView* tv, const ExpressionEvaluator& expr_eval) { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 71a3b46ddbb..c1907fc1955 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -80,7 +80,7 @@ TensorShapeInfo inferTensorShapes( // // If dynamic_evaluate is true, then any argument with AllocationType::Evaluate // will not be populated, it will be filled with std::monostate. -KernelArgumentHolder allocateKernelOutputs( +KernelArgumentHolder allocateOutputs( const Fusion* fusion, const std::vector& output_infos, const std::vector& output_alias_to_input_map, @@ -96,10 +96,4 @@ std::vector getBufferInfos( DataType index_dtype, const std::vector& fusion_outputs); -std::vector getInputBufferInfos( - ExpressionEvaluator& expr_eval, - DataType index_dtype, - const std::vector& fusion_outputs, - const std::vector& inputs); - } // namespace nvfuser diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8340db7b33c..69d0b6a53da 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -1246,7 +1246,7 @@ KernelArgumentHolder KernelExecutor::run( // only allocate outputs when not given if (output_args.empty()) { - output_args = allocateKernelOutputs( + output_args = allocateOutputs( compiled_kernel_->kernel(), executor_entry->outputs, executor_entry->output_aliased_to_input, From 7a320bd2dd45f10187e3f827fbdb7f40c3d3f356 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 07:44:42 -0800 Subject: [PATCH 28/35] Clang. 
--- csrc/runtime/executor_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index e81f2895a31..8a54d0c30b9 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -616,7 +616,7 @@ std::vector getOutputAliasToInputMap(const Fusion* fusion) { std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_to_input_map[(int64_t)output_idx] = aliased_to_idx; + output_to_input_map[(int64_t)output_idx] = (int64_t)aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); From 15b03c581129014be13d49d3d641ae225fcd3417 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 08:04:02 -0800 Subject: [PATCH 29/35] Cleanup, remove dead code. --- benchmarks/cpp/matmul.cpp | 9 +- csrc/runtime/executor.cpp | 159 +-------------------------- csrc/runtime/executor.h | 20 +--- csrc/runtime/executor_kernel_arg.cpp | 40 ------- csrc/runtime/executor_kernel_arg.h | 14 --- 5 files changed, 10 insertions(+), 232 deletions(-) diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 0f5159b9cad..5ef718b4a4f 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,7 +175,7 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); KernelExecutor ke; - ke.compile(fusion, args.toC10Array(), launch_constraints, cparams); + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), launch_constraints) .empty(), @@ -352,7 +352,7 @@ static void SingleMatmulPartitionedK( // Compile kernel KernelExecutor ke; auto lparams = LaunchParams(); - ke.compile(fusion, args.toC10Array(), lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( getBankConflictInfo(ke.compiledKernel()->kernel(), 
lparams).empty(), "Shared memory bank conflict not removed."); @@ -461,10 +461,7 @@ static void NvFuserScheduler_MatmulSplitKReduction( // Compile kernel KernelExecutor ke; ke.compile( - fusion, - args.toC10Array(), - heuristic_params->lparams, - heuristic_params->cparams); + fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( getBankConflictInfo( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 69d0b6a53da..8cf1adf4625 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -665,7 +665,6 @@ void KernelExecutor::initializeExecutorEntry( auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); - // NVF_THROW("Stop here"); for (const auto& entry : compiled_kernel_->kernel()->summary().validations) { NVF_CHECK(expr_eval.evaluate(entry.first).as(), entry.second); } @@ -783,101 +782,6 @@ void KernelExecutor::initializeExecutorEntry( executor_entry.init = true; } -/// Copies the data, logical_size, and alloc_stride parameters to the -/// appropriate parts of entry.args[idx]. -/// -/// For GPU tensors, we pass a Tensor struct (see -/// runtime/tensor.cu), where the rank describes the number of elements in the -/// shape and stride arrays. The actual shapes and strides are dynamic, but the -/// type and rank of the tensors are actually static (changing them would need -/// a new FusionDefinition). So we create the storage area for the -/// Tensor during ::computeArgs, and then in this function we just -/// update that memory with the current values for the tensor's base address, -/// shape, and strides. -/// -/// @param entry the entry we have previously setup for this fusion -/// @param idx the index into entry.args and related parallel arrays in the -/// entry. -/// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for -/// computing how large the arrays to copy are. 
-static void fillTensorArgMetadata( - KernelExecutorEntry& entry, - const PolymorphicValue& tensor_metadata, - size_t idx, - size_t idx_type_size) { - void* data = tensor_metadata->*&TensorMetaData::data; - // g++ has trouble inferring the types of more complicated fields through our - // *& operators. Creating an `auto` alias as a temporary resolves this - // problem. -#define TMD_ARRAY_REF(pv, field) \ - ({ \ - const auto& fld_tmp_ = pv->*&field; \ - const c10::IntArrayRef& fld_aref_ = fld_tmp_; \ - fld_aref_; \ - }) - const c10::IntArrayRef& shape = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::logical_size); - const c10::IntArrayRef& strides = - TMD_ARRAY_REF(tensor_metadata, TensorMetaData::alloc_stride); -#undef TMD_ARRAY_REF - - // These are the three offsets we need to copy into. - std::array offsets = { - entry.args[idx].data(), // data ptr - entry.args[idx].data() + sizeof(void*), // shape array - // strides array: - entry.args[idx].data() + sizeof(void*) + shape.size() * idx_type_size, - }; - - memcpy(offsets[0], &data, sizeof(void*)); - switch (idx_type_size) { - case sizeof(int64_t): { - // we use i64's for our sizes, so can use a simple copy here - memcpy(offsets[1], shape.data(), shape.size() * sizeof(int64_t)); - memcpy(offsets[2], strides.data(), strides.size() * sizeof(int64_t)); - } break; - case sizeof(int32_t): { - // we need to cast per-element, so need a loop. - // This case happens when the kernel uses 32bit indices. Since we - // (specifically TensorMetaData) store indices in 64bit, we can't - // directly copy our buffer into the args buffer. We thus have to - // manually downcast each element to fit in the smaller buffer. 
- for (size_t i = 0; i < shape.size(); ++i) { - const int32_t shp = static_cast(shape[i]); - memcpy(offsets[1] + i * sizeof(int32_t), &shp, sizeof(int32_t)); - } - // In rare cases we have fewer strides than shapes - for (size_t i = 0; i < strides.size(); ++i) { - const int32_t strd = static_cast(strides[i]); - memcpy(offsets[2] + i * sizeof(int32_t), &strd, sizeof(int32_t)); - } - } break; - default: - NVF_CHECK(0, "Unhandled index type size"); - break; - } -} - -// set the arguments that we'll pass to cuLaunchKernel. This should happen -// when we change the rank of a tensor or the number of arguments to a kernel. -// It does not need to happen when only shapes change---use recomputeArgs for -// that. -void KernelExecutor::computeArgs( - KernelExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); - - const std::vector& params = kernel->parameters(); - entry.args.resize(params.size()); - entry.arg_ptrs.resize(params.size()); - const PrimDataType idx_type = kernel->indexType(); - for (size_t p = 0; p < params.size(); ++p) { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); - entry.arg_ptrs[p] = entry.args[p].data(); - } -} - namespace { GlobalBufferInfo& linear_buffer_info_getter( KernelExecutorEntry& entry, @@ -906,10 +810,10 @@ GlobalBufferInfo& linear_buffer_info_getter( }; } // namespace -void KernelExecutor::computeArgs2( +void KernelExecutor::computeArgs( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { - FUSER_PERF_SCOPE("KernelExecutor::computeArgs2"); + FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); if (entry.args.size() != args.size()) { entry.args.resize(args.size()); entry.arg_ptrs.resize(args.size()); @@ -997,42 +901,6 @@ void KernelExecutor::computeArgs2( } } -// Reset the arguments that we'll pass to cuLaunchKernel. This needs to be -// invoked on every shape change. 
-void KernelExecutor::recomputeArgs( - KernelExecutorEntry& entry, - ExpressionEvaluator& expr_eval, - const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); - - const std::vector& params = kernel->parameters(); - const PrimDataType idx_type = kernel->indexType(); - // assert(entry.args.size() == params.size()); - // assert(entry.arg_ptrs.size() == params.size()); - // assert(params.size() >= args.size()); - for (size_t p = 0; p < params.size(); ++p) { - PolymorphicValue pv = expr_eval.evaluate(params[p]); - if (pv.is() && pv.as().is_cuda()) { - // GPU tensors are not passed directly: instead we pass a Tensor struct. The pointer and dimensions are dynamic, but the - // types and ranks are actually static (changing the rank or the types - // would need to be done via a new FusionDefinition). As such, we created - // the Tensor struct during ::computeArgs, and here we just fill - // in the base address, shape, and stride arrays to cover whatever new - // tensors we got this round. - TensorView* mtv = dynamic_cast(params[p]); - const Val* mdexpr = IrBuilder::metadataExpr(mtv); - const PolymorphicValue& tmd = expr_eval.evaluate(mdexpr); - const size_t idx_type_size = - PrimDataType::Int == idx_type ? 
sizeof(int64_t) : sizeof(int32_t); - fillTensorArgMetadata(entry, tmd, p, idx_type_size); - } else { - entry.args[p] = getKernelArgument(expr_eval, params[p], idx_type); - } - entry.arg_ptrs[p] = entry.args[p].data(); - } -} - int64_t KernelExecutor::getAvailableDynamicSmemSize() { if (!available_dynamic_smem_size_.has_value()) { int size = 0; @@ -1219,7 +1087,6 @@ KernelArgumentHolder KernelExecutor::run( // Initialize the executor entry if not initlized if (!executor_entry->init) { - // std::cout << "Initializing executor entry" << std::endl; initializeExecutorEntry( *executor_entry, args, @@ -1227,7 +1094,6 @@ KernelArgumentHolder KernelExecutor::run( compile_params, output_args, compiled_kernel_->kernel()->indexType()); - // std::cout << "Executor entry initialized" << std::endl; } if (!(executor_entry->launch_params.nThreads() <= @@ -1254,7 +1120,7 @@ KernelArgumentHolder KernelExecutor::run( args, has_dynamic_alias_); if (has_dynamic_alias_) { - // TODO: Make sure dynamic alias works. + // TODO: Make sure there's a dynamic alias test. 
for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; @@ -1335,14 +1201,6 @@ KernelArgumentHolder KernelExecutor::run( } args.push(intermediate_buffer); intermediate_args.push(intermediate_buffer); - // expr_eval.bind( - // compiled_kernel_->kernel() - // ->summary() - // .global_allocations.at(i) - // ->buffer(), - // args - // [compiled_kernel_->kernel()->inputs().size() + outputs.size() + - // i]); if (buf_info.is_profile_buffer) { profile_buffer = intermediate_buffer; } @@ -1358,21 +1216,16 @@ KernelArgumentHolder KernelExecutor::run( if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments - // std::cout << "Resolving TMA" << std::endl; args = resolveTMA(*executor_entry, args); - // std::cout << "TMA resolved" << std::endl; } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but // doesn't require binding all values as getting RNG seed and offset // doesn't depend on other values - // std::cout << "Resolving RNG seed" << std::endl; args = resolveRNGSeed(compiled_kernel_->kernel(), args); - // std::cout << "RNG seed resolved" << std::endl; } } - // std::cout << "Computing args" << std::endl; - computeArgs2(*executor_entry, args); - // std::cout << "Args computed" << std::endl; + + computeArgs(*executor_entry, args); if (isDebugDumpEnabled(DebugDumpOption::LaunchParam)) { launch_params_.print(); @@ -1398,8 +1251,6 @@ KernelArgumentHolder KernelExecutor::run( FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); - // recomputeArgs(*executor_entry, expr_eval, compiled_kernel_->kernel()); - if (isDebugDumpEnabled(DebugDumpOption::Occupancy) || isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) { int blocks_per_sm = -1; diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index bd352dd4fe6..4b8c5e12be5 100644 --- 
a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -229,29 +229,13 @@ class KernelExecutor : public ExecutorAbstract { // Creates the initial set of arguments to a kernel, based on the arguments // to we have now. - void computeArgs( - KernelExecutorEntry&, - ExpressionEvaluator&, - const kir::Kernel*) const; - - void computeArgs2( - KernelExecutorEntry& entry, - const KernelArgumentHolder& args) const; + void computeArgs(KernelExecutorEntry& entry, const KernelArgumentHolder& args) + const; KernelArgumentHolder resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const; - // Updates an existing set of arguments based on the current arguments. It is - // is an error to call this before `computeArgs` has been invoked. - // recomputeArgs will fail if the arity of the function changes, or the rank - // of any tensor changes (as these are compiled-in to the generated kernel - // and therefore would require us to do a larger recompilation). - void recomputeArgs( - KernelExecutorEntry&, - ExpressionEvaluator&, - const kir::Kernel*) const; - //! 
Serialize CompiledKernel using flatbuffers flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 0392ae65b9a..184d7579b84 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -120,15 +120,6 @@ void KernelArgumentHolder::pushTensorProxy( push(meta_tensor); } -std::vector KernelArgumentHolder::toC10Array() const { - std::vector ival_array; - ival_array.reserve(arguments_.size()); - for (const auto& arg : arguments_) { - ival_array.push_back(PolymorphicValue_functions::toIValue(arg)); - } - return ival_array; -} - void KernelArgumentHolder::setDeviceIndex(std::optional index) { if (index.has_value()) { device_index_ = index.value(); @@ -359,26 +350,6 @@ std::vector polymorphicValueToBytes( } } -std::vector getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type) { - FUSER_PERF_SCOPE("getKernelArgument"); - NVF_ERROR(parameter != nullptr); - PolymorphicValue pv = ee.evaluate(parameter); - if (auto tv = dynamic_cast(parameter)) { - if (tv->isCpuScalar()) { - return polymorphicValueToBytes(pv, tv->dtype(), index_type); - } else { - const Val* metadata_val = IrBuilder::metadataExpr(tv); - const PolymorphicValue& metadata = ee.evaluate(metadata_val); - return polymorphicValueToBytes( - metadata, metadata_val->dtype(), index_type); - } - } - return polymorphicValueToBytes(pv, parameter->dtype(), index_type); -} - int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers @@ -391,15 +362,4 @@ int64_t computeBytes(const KernelArgumentHolder& args) { return num_bytes; } -int64_t computeBytes(const std::vector& outputs) { - int64_t num_bytes = 0; - for (auto i : c10::irange(outputs.size())) { - const auto& output = outputs.at(i); - // NOTE: this assumes that all output elements correspond to a single - 
// store - num_bytes += static_cast(output.storage().nbytes()); - } - return num_bytes; -} - } // namespace nvfuser diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index a8a7fb110de..d96705d01e7 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -107,8 +107,6 @@ class NVF_API KernelArgumentHolder { void erase(const PolymorphicValue& arg_to_delete); - std::vector toC10Array() const; - PolymorphicValue& back() { return arguments_.back(); } @@ -227,18 +225,6 @@ std::vector polymorphicValueToBytes( const DataType& dtype, PrimDataType index_type); -std::vector getKernelArgument( - ExpressionEvaluator& ee, - Val* parameter, - PrimDataType index_type); - -std::vector getKernelArgument( - at::Tensor tensor, - const GlobalBufferInfo& output_info, - PrimDataType index_type); - int64_t computeBytes(const KernelArgumentHolder& args); -int64_t computeBytes(const std::vector& outputs); - } // namespace nvfuser From 0db5b89bc6f069d7ba9f699bb539ea2d274c756e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 08:54:13 -0800 Subject: [PATCH 30/35] Cleanup argument to byte conversion. 
--- csrc/runtime/compiled_kernel.cpp | 29 ++++---- csrc/runtime/executor.cpp | 76 ++++++--------------- csrc/runtime/executor_kernel_arg.cpp | 99 ++++++++++++++++------------ csrc/runtime/executor_kernel_arg.h | 10 +++ 4 files changed, 98 insertions(+), 116 deletions(-) diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp index 8b4f37ed3ca..d4715339c0b 100644 --- a/csrc/runtime/compiled_kernel.cpp +++ b/csrc/runtime/compiled_kernel.cpp @@ -1416,23 +1416,18 @@ float RtcKernel::run( std::vector pointers; for (const auto& input : args) { - const auto& input_tensor = input.as(); - auto dtype = std::get( - aten_to_data_type(input_tensor.scalar_type()).type); - DataType metadata_type = globalTensorMetaData(dtype, input_tensor.dim()); - - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = dtype; - metadata->data = input_tensor.data_ptr(); - metadata->logical_size = input_tensor.sizes(); - metadata->logical_stride = input_tensor.strides(); - metadata->alloc_size = input_tensor.sizes(); - metadata->alloc_stride = input_tensor.strides(); - - data.emplace_back(polymorphicValueToBytes( - PolymorphicValue(std::move(struct_)), metadata_type, index_type)); - pointers.emplace_back(data.back().data()); + NVF_ERROR( + input.is() && input.as().is_cuda(), + "Only CUDA tensors are supported for direct nvRTC launches at this time."); + if (input.is() && input.as().is_cuda()) { + auto input_tensor = input.as(); + data.emplace_back(tensorToBytes( + input_tensor, + input_tensor.sizes().vec(), + input_tensor.strides().vec(), + index_type)); + pointers.emplace_back(data.back().data()); + } } NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 8cf1adf4625..b2d34d3a1b2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -173,6 +173,7 @@ void KernelExecutor::compile( CompileParams compile_params, SchedulerType 
scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); + NVF_ERROR( supported(fusion), "KernelExecutor does not support the Fusion provided."); @@ -661,7 +662,7 @@ void KernelExecutor::initializeExecutorEntry( ExpressionEvaluator expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - // expr_eval.precomputedValues() = evaluatorPrecomputedValues().get(); + auto launch_params = computeLaunchParams( launch_constraints, expr_eval, warp_size_, index_type); @@ -835,57 +836,16 @@ void KernelExecutor::computeArgs( const PrimDataType idx_type = compiled_kernel_->kernel()->indexType(); int64_t buffer_info_idx = 0; for (size_t arg_idx = 0; arg_idx < args.size(); ++arg_idx) { - std::vector bytes; if (args[arg_idx].is() && args[arg_idx].as().is_cuda()) { - auto tensor = args[arg_idx].as(); - - auto data = tensor.data_ptr(); const auto& buffer_info = - linear_buffer_info_getter(entry, buffer_info_idx); - const auto& logical_size = buffer_info.shape_info.logical_sizes.size() == - buffer_info.shape_info.unsharded_logical_sizes.size() - ? buffer_info.shape_info.unsharded_logical_sizes - : buffer_info.shape_info.logical_sizes; - const auto& alloc_stride = buffer_info.shape_info.allocation_strides; - buffer_info_idx++; - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (idx_type == PrimDataType::Int) { - bytes.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); - bytes.insert( - bytes.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - bytes.insert( - bytes.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - bytes.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - bytes.insert( - bytes.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - bytes.insert( - bytes.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - entry.args[arg_idx] = bytes; + linear_buffer_info_getter(entry, buffer_info_idx++); + entry.args[arg_idx] = tensorToBytes( + args[arg_idx], + buffer_info.shape_info.logical_sizes, + buffer_info.shape_info.allocation_strides, + idx_type, + buffer_info.shape_info.unsharded_logical_sizes); entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); } else { if (args[arg_idx].is()) { @@ -996,6 +956,8 @@ KernelArgumentHolder resolveRNGSeed( } } // namespace +// TODO: Reduce bindings to only those necessaary to resolve missing params. +// TODO: Check if this could be reused to also resolve dynamic aliases. 
KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, const KernelArgumentHolder& args) const { @@ -1018,7 +980,9 @@ KernelArgumentHolder KernelExecutor::resolveTMA( } for (const auto& intermediate_entry : entry.intermediates) { - expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + if (args[arg_idx].hasValue()) { + expr_eval.bind(intermediate_entry.tv, args[arg_idx++]); + } } KernelArgumentHolder resolved_args; @@ -1047,11 +1011,6 @@ KernelArgumentHolder KernelExecutor::run( sprof.startKernel(); } - ExpressionEvaluator expr_eval; - if (has_dynamic_alias_ || has_tma_) { - expr_eval = executor_utils::bindInputs(args, compiled_kernel_->kernel()); - } - NVF_ERROR(isCompiled()); NVF_ERROR( output_args.empty() || @@ -1120,7 +1079,12 @@ KernelArgumentHolder KernelExecutor::run( args, has_dynamic_alias_); if (has_dynamic_alias_) { - // TODO: Make sure there's a dynamic alias test. + ExpressionEvaluator expr_eval; + if (has_dynamic_alias_ || has_tma_) { + expr_eval = + executor_utils::bindInputs(args, compiled_kernel_->kernel()); + } + for (const auto i : c10::irange(compiled_kernel_->kernel()->outputs().size())) { auto param = compiled_kernel_->kernel()->outputs()[i]; diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 184d7579b84..1531a253504 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -287,49 +287,8 @@ std::vector polymorphicValueToBytes( // FUSER_PERF_SCOPE("polymorphicValueToBytes(StructHandle)"); std::vector buffer; if (argument.as().is()) { - auto& data = argument->*&TensorMetaData::data; - auto& logical_size = argument->*&TensorMetaData::logical_size; - auto& alloc_stride = argument->*&TensorMetaData::alloc_stride; - - // special handle for TensorMetaData so that CPU overhead is minimal. 
- if (index_type == PrimDataType::Int) { - buffer.reserve( - sizeof(void*) + sizeof(int64_t) * logical_size.size() + - sizeof(int64_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - buffer.insert( - buffer.end(), - (std::byte*)logical_size.data(), - (std::byte*)logical_size.data() + - sizeof(int64_t) * logical_size.size()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride.data(), - (std::byte*)alloc_stride.data() + - sizeof(int64_t) * alloc_stride.size()); - } else { - buffer.reserve( - sizeof(void*) + sizeof(int32_t) * logical_size.size() + - sizeof(int32_t) * alloc_stride.size()); - buffer.insert( - buffer.end(), (std::byte*)&data, (std::byte*)&data + sizeof(void*)); - std::vector logical_size32( - logical_size.begin(), logical_size.end()); - buffer.insert( - buffer.end(), - (std::byte*)logical_size32.data(), - (std::byte*)logical_size32.data() + - sizeof(int32_t) * logical_size32.size()); - std::vector alloc_stride32( - alloc_stride.begin(), alloc_stride.end()); - buffer.insert( - buffer.end(), - (std::byte*)alloc_stride32.data(), - (std::byte*)alloc_stride32.data() + - sizeof(int32_t) * alloc_stride32.size()); - } - return buffer; + NVF_THROW( + "Don't send tensor metadata to this function directly, use tensorToBytes."); } else { const auto& dtype_ = std::get(dtype.type); for (const auto& field : dtype_.fields) { @@ -350,6 +309,60 @@ std::vector polymorphicValueToBytes( } } +std::vector tensorToBytes( + const PolymorphicValue& argument, + const std::vector& logical_sizes, + const std::vector& alloc_strides, + PrimDataType idx_type, + const std::vector& unsharded_logical_sizes) { + std::vector bytes; + NVF_ERROR( + argument.is() && argument.as().is_cuda(), + "Argument is not a CUDA tensor."); + auto tensor = argument.as(); + auto data = tensor.data_ptr(); + + const auto& size_to_use = + logical_sizes.size() == unsharded_logical_sizes.size() + ? 
unsharded_logical_sizes + : logical_sizes; + // special handle for TensorMetaData so that CPU overhead is minimal. + if (idx_type == PrimDataType::Int) { + bytes.reserve( + sizeof(void*) + sizeof(int64_t) * size_to_use.size() + + sizeof(int64_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + bytes.insert( + bytes.end(), + (std::byte*)size_to_use.data(), + (std::byte*)size_to_use.data() + sizeof(int64_t) * size_to_use.size()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_strides.data(), + (std::byte*)alloc_strides.data() + + sizeof(int64_t) * alloc_strides.size()); + } else { + bytes.reserve( + sizeof(void*) + sizeof(int32_t) * size_to_use.size() + + sizeof(int32_t) * alloc_strides.size()); + bytes.insert(bytes.end(), (std::byte*)&data, (std::byte*)(&data + 1)); + std::vector logical_size32(size_to_use.begin(), size_to_use.end()); + bytes.insert( + bytes.end(), + (std::byte*)logical_size32.data(), + (std::byte*)logical_size32.data() + + sizeof(int32_t) * logical_size32.size()); + std::vector alloc_stride32( + alloc_strides.begin(), alloc_strides.end()); + bytes.insert( + bytes.end(), + (std::byte*)alloc_stride32.data(), + (std::byte*)alloc_stride32.data() + + sizeof(int32_t) * alloc_stride32.size()); + } + return bytes; +} + int64_t computeBytes(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers diff --git a/csrc/runtime/executor_kernel_arg.h b/csrc/runtime/executor_kernel_arg.h index d96705d01e7..77198ae9002 100644 --- a/csrc/runtime/executor_kernel_arg.h +++ b/csrc/runtime/executor_kernel_arg.h @@ -220,11 +220,21 @@ class NVF_API KernelArgumentHolder { std::optional cache_id_ = std::nullopt; }; +// Used to convert a polymorphic value to a byte vector. Do not use for CUDA +// Tensors, use tensorToBytes instead. 
std::vector polymorphicValueToBytes( const PolymorphicValue& argument, const DataType& dtype, PrimDataType index_type); +// Used to convert a CUDA tensor to a byte vector. +std::vector tensorToBytes( + const PolymorphicValue& argument, + const std::vector& logical_sizes, + const std::vector& allocation_strides, + PrimDataType idx_type, + const std::vector& unsharded_logical_sizes = {}); + int64_t computeBytes(const KernelArgumentHolder& args); } // namespace nvfuser From 2241594a8f4171cba3f65db59f4ad7fcb3e89d11 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 16:33:20 -0800 Subject: [PATCH 31/35] PR Comments. --- csrc/host_ir/executor.cpp | 1 - csrc/runtime/allocations.cpp | 4 ++-- csrc/runtime/compiled_kernel.cpp | 16 +++++++--------- csrc/runtime/executor.cpp | 16 ++++++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 32c2f5a956e..b2dc87b2789 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -107,7 +107,6 @@ KernelArgumentHolder HostIrExecutor::run( if (output_args.empty()) { std::vector output_infos = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); - output_args.resize(host_ir_container_->outputs().size()); auto output_alias_to_input = executor_utils::getOutputAliasToInputMap(host_ir_container_.get()); output_args = allocateOutputs( diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 83ab60d900d..76a87a60e81 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -309,7 +309,7 @@ std::vector getBufferInfos( ExpressionEvaluator& expr_eval, DataType index_dtype, const std::vector& fusion_outputs) { - FUSER_PERF_SCOPE("fusion_executor::allocations::getOutbufferInfos"); + FUSER_PERF_SCOPE("fusion_executor::allocations::getBufferInfos"); std::vector output_buffer_infos; output_buffer_infos.reserve(fusion_outputs.size()); for (const auto out : 
fusion_outputs) { @@ -696,7 +696,7 @@ TensorShapeInfo inferTensorShapes( auto alias_info = tv->fusion()->getOutputAlias(tv); if (alias_info.type == AllocationType::Evaluate) { auto val = expr_eval.evaluate(tv); - NVF_ERROR(val.hasValue() && val.is(), "Output is not a Tensor"); + NVF_ERROR(val.is(), "Output is not a Tensor"); auto tensor = val.as(); if (!tv->hasAllocation()) { diff --git a/csrc/runtime/compiled_kernel.cpp b/csrc/runtime/compiled_kernel.cpp index d4715339c0b..e2aaf18069b 100644 --- a/csrc/runtime/compiled_kernel.cpp +++ b/csrc/runtime/compiled_kernel.cpp @@ -1419,15 +1419,13 @@ float RtcKernel::run( NVF_ERROR( input.is() && input.as().is_cuda(), "Only CUDA tensors are supported for direct nvRTC launches at this time."); - if (input.is() && input.as().is_cuda()) { - auto input_tensor = input.as(); - data.emplace_back(tensorToBytes( - input_tensor, - input_tensor.sizes().vec(), - input_tensor.strides().vec(), - index_type)); - pointers.emplace_back(data.back().data()); - } + auto input_tensor = input.as(); + data.emplace_back(tensorToBytes( + input_tensor, + input_tensor.sizes().vec(), + input_tensor.strides().vec(), + index_type)); + pointers.emplace_back(data.back().data()); } NVFUSER_CUDA_SAFE_CALL(cuLaunchKernel( diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index b2d34d3a1b2..0b35baee5bd 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -299,7 +299,7 @@ void KernelExecutor::compile( // slice, and that slice has a symbolic integer it depends on, then this // function returns true. // - // This could happen for other examples and this function will return true if + // This could happen for other examples and has_dynamic_alias_ will be true if // to evaluate the output that has an alias, other values besides the aliased // input need to be bound to the expression evaluator to evaluate the output. 
for (auto output : fusion->outputs()) { @@ -315,11 +315,12 @@ void KernelExecutor::compile( if (input->isA() && input->sameAs(aliased_to)) { continue; } + + if (input->isConst()) { + continue; + } + has_dynamic_alias_ = true; } - if (out_tv->isConst()) { - continue; - } - has_dynamic_alias_ = true; } } } @@ -956,7 +957,7 @@ KernelArgumentHolder resolveRNGSeed( } } // namespace -// TODO: Reduce bindings to only those necessaary to resolve missing params. +// TODO: Reduce bindings to only those necessary to resolve missing params. // TODO: Check if this could be reused to also resolve dynamic aliases. KernelArgumentHolder KernelExecutor::resolveTMA( KernelExecutorEntry& entry, @@ -1180,6 +1181,9 @@ KernelArgumentHolder KernelExecutor::run( if (has_tma_) { // Resolving TMA requires binding all values and evaluating the TMA // arguments + // + // Resolving TMA also resolves RNG, so if TMA exists the resolveRNGSeed + // function shouldn't also be called. args = resolveTMA(*executor_entry, args); } else if (has_rng_) { // Resolving RNG seed requires evaluating and adding those values, but From 9cbff1646206a82c91210352b9e37dd2cdd4e551 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 16:47:44 -0800 Subject: [PATCH 32/35] Clang. 
--- csrc/runtime/executor_kernel_arg.cpp | 2 +- csrc/runtime/executor_utils.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/runtime/executor_kernel_arg.cpp b/csrc/runtime/executor_kernel_arg.cpp index 1531a253504..ccc2f7657f7 100644 --- a/csrc/runtime/executor_kernel_arg.cpp +++ b/csrc/runtime/executor_kernel_arg.cpp @@ -319,7 +319,7 @@ std::vector tensorToBytes( NVF_ERROR( argument.is() && argument.as().is_cuda(), "Argument is not a CUDA tensor."); - auto tensor = argument.as(); + const auto& tensor = argument.as(); auto data = tensor.data_ptr(); const auto& size_to_use = diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index 8a54d0c30b9..2900a1bb022 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -616,7 +616,7 @@ std::vector getOutputAliasToInputMap(const Fusion* fusion) { std::find( fusion->inputs().begin(), fusion->inputs().end(), aliased_to)); if (aliased_to_idx < (int64_t)fusion->inputs().size()) { - output_to_input_map[(int64_t)output_idx] = (int64_t)aliased_to_idx; + output_to_input_map[output_idx] = (int)aliased_to_idx; } else { auto aliased_out = std::find( fusion->outputs().begin(), fusion->outputs().end(), aliased_to); From 28e7cf90b01b64d94d5a2c249cbfffcf22091756 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 17:14:01 -0800 Subject: [PATCH 33/35] Make allocation sizes/strides optional in tensor shape info. --- csrc/runtime/allocations.cpp | 16 +++++++--------- csrc/runtime/allocations.h | 5 +++++ csrc/runtime/executor.cpp | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 76a87a60e81..28fcf60d347 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -704,9 +704,8 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec(), isSharded(tv) ? 
unshardedSizes(tv, tensor.sizes().vec()) - : tensor.sizes().vec(), - tensor.sizes().vec(), - tensor.strides().vec()}; + : std::vector(), + }; } auto allocation_size_stride = inferAndValidateAllocationSizesAndStrides(tensor, tv, expr_eval); @@ -714,7 +713,7 @@ TensorShapeInfo inferTensorShapes( tensor.sizes().vec(), tensor.strides().vec(), isSharded(tv) ? unshardedSizes(tv, tensor.sizes().vec()) - : tensor.sizes().vec(), + : std::vector(), allocation_size_stride.first, allocation_size_stride.second}; } @@ -726,9 +725,8 @@ TensorShapeInfo inferTensorShapes( allocation_size_stride.first, allocation_size_stride.second, isSharded(tv) ? unshardedSizes(tv, allocation_size_stride.first) - : allocation_size_stride.first, - allocation_size_stride.first, - allocation_size_stride.second}; + : std::vector(), + }; } auto options = @@ -740,11 +738,11 @@ TensorShapeInfo inferTensorShapes( // `transformFromAllocationToLogical` logical_meta_tensor = transformFromAllocationToLogical(logical_meta_tensor, tv, expr_eval); - return { + return TensorShapeInfo{ logical_meta_tensor.sizes().vec(), logical_meta_tensor.strides().vec(), isSharded(tv) ? unshardedSizes(tv, logical_meta_tensor.sizes().vec()) - : logical_meta_tensor.sizes().vec(), + : std::vector(), allocation_size_stride.first, allocation_size_stride.second}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index c1907fc1955..b05c6cffc27 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -17,6 +17,11 @@ namespace nvfuser { struct KernelExecutorEntry; +// If not sharded unsharded_logical_sizes is empty. +// If no allocation domain is found, allocation_sizes and allocation_strides +// are empty. +// For intermediate tensors, logical_sizes and logical_strides are used only, +// the rest are empty. 
struct TensorShapeInfo { std::vector logical_sizes; std::vector logical_strides; diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 0b35baee5bd..bed20321cd9 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -523,8 +523,6 @@ std::vector KernelExecutor::getIntermediateBufferInfo( auto [sizes, strides] = has_expanded_domains ? inferShapeOfOutput(tv, expr_eval) : inferShapeOfIntermediate(tv, alloc, expr_eval); - info.shape_info.allocation_sizes = sizes; - info.shape_info.allocation_strides = strides; info.shape_info.logical_sizes = sizes; info.shape_info.logical_strides = strides; auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype(); @@ -713,9 +711,6 @@ void KernelExecutor::initializeExecutorEntry( std::tie(alloc_sizes, alloc_strides) = inferAndValidateAllocationSizesAndStrides( at_tensor, input_tv, expr_eval); - } else { - alloc_sizes = at_tensor.sizes().vec(); - alloc_strides = at_tensor.strides().vec(); } TensorShapeInfo shape_info; @@ -764,8 +759,6 @@ void KernelExecutor::initializeExecutorEntry( "Accepting allocated outputs is not currently supported with allocation domain. ", "Allocation domain found for tv: ", info.tv->toString()); - info.shape_info.allocation_sizes = output_tensor.sizes().vec(); - info.shape_info.allocation_strides = output_tensor.strides().vec(); info.shape_info.logical_sizes = output_tensor.sizes().vec(); info.shape_info.logical_strides = output_tensor.strides().vec(); output_info.emplace_back(info); @@ -844,7 +837,9 @@ void KernelExecutor::computeArgs( entry.args[arg_idx] = tensorToBytes( args[arg_idx], buffer_info.shape_info.logical_sizes, - buffer_info.shape_info.allocation_strides, + buffer_info.shape_info.allocation_strides.empty() + ? 
buffer_info.shape_info.logical_strides + : buffer_info.shape_info.allocation_strides, idx_type, buffer_info.shape_info.unsharded_logical_sizes); entry.arg_ptrs[arg_idx] = entry.args[arg_idx].data(); @@ -1115,22 +1110,27 @@ KernelArgumentHolder KernelExecutor::run( at::Tensor profile_buffer; { FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); + // Intermediates just use logical sizes and strides even though they're + // really allocation sizes and strides. + // + // This is simply because the convention used is that allocation + // sizes/strides are optional, logical are not. for (const auto intermediate_i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(intermediate_i); bool has_expansion = false; std::vector unexpanded_sizes; - unexpanded_sizes.reserve(buf_info.shape_info.allocation_sizes.size()); + unexpanded_sizes.reserve(buf_info.shape_info.logical_sizes.size()); NVF_ERROR( - buf_info.shape_info.allocation_sizes.size() == - buf_info.shape_info.allocation_strides.size()) + buf_info.shape_info.logical_sizes.size() == + buf_info.shape_info.logical_strides.size()) for (const auto j : - c10::irange(buf_info.shape_info.allocation_sizes.size())) { - if (buf_info.shape_info.allocation_strides[j] == 0) { + c10::irange(buf_info.shape_info.logical_sizes.size())) { + if (buf_info.shape_info.logical_strides[j] == 0) { has_expansion = true; unexpanded_sizes.push_back(1L); } else { - unexpanded_sizes.push_back(buf_info.shape_info.allocation_sizes[j]); + unexpanded_sizes.push_back(buf_info.shape_info.logical_sizes[j]); } } at::Tensor intermediate_buffer; From a6252f70b6cdbe6e1151ba4ed673903f5e16269e Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 3 Mar 2025 17:22:05 -0800 Subject: [PATCH 34/35] Remove intermediate logic in allocations.cpp output logic can be used instead. 
--- csrc/runtime/allocations.cpp | 24 ------------------------ csrc/runtime/allocations.h | 7 ------- csrc/runtime/executor.cpp | 19 ++++++------------- 3 files changed, 6 insertions(+), 44 deletions(-) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index 28fcf60d347..d0dc3becbb9 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -149,12 +149,6 @@ std::pair, std::vector> inferShape( const ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("fusion_executor::allocations::inferShape"); - // Allocate should be provided for intermediates. We just need to - // grab a chunk of memory of the size dicatated by - // Allocate::shape(). Fusion outputs do not come with Allocate and - // need to be allocated while taking expanded broadcasts into - // account. - std::vector concrete_sizes(symbolic_sizes.size(), 0); for (const auto i : c10::irange(symbolic_sizes.size())) { @@ -179,24 +173,6 @@ std::pair, std::vector> inferShape( } } // namespace -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("fusion_executor::allocations::inferShapeOfIntermediate"); - // The allocation domain represents the logical allocation domain, - // bu its actual allocation size may be different, e.g., for - // supporting halo accesses. The actual size is currently computed - // when creating the Allocate expr. - NVF_ERROR(alloc != nullptr); - const auto& symbolic_sizes = alloc->shape(); - // For intermediate tensors, we just need to allocate a memory chunk - // of the specified size. Broadcast expansion does not need to be considered. 
- const auto expand_flags = std::vector(symbolic_sizes.size(), false); - - return inferShape(tv, symbolic_sizes, expand_flags, expr_eval); -} - static bool fill_allocation_with_nan_ = false; bool shouldFillAllocationWithNan() { diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index b05c6cffc27..40dda0eb842 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -57,13 +57,6 @@ int64_t computeSharedMemory( DataType index_type, int64_t smem_offset = 0); -// Infer the shape of an intemediate tensor using kir::Allocate. This -// is not ideal but still necessary when tensors are expanded with halo -std::pair, std::vector> inferShapeOfIntermediate( - const TensorView* tv, - const kir::Allocate* alloc, - ExpressionEvaluator& expr_eval); - bool shouldFillAllocationWithNan(); NVF_API void setFillAllocationWithNan(bool value); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index bed20321cd9..2b8e43a5db2 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -510,19 +510,12 @@ std::vector KernelExecutor::getIntermediateBufferInfo( info.zero_init = alloc->zeroInit(); info.resets_to_zero = alloc->resetsToZero(); // TODO: Allocation size needs to consider both expanded domains - // as well as halo. Currently, allocation of tensors with halo is - // only supported by inferShapeOfIntermediate, whereas expanded - // domains are only supported by inferShapeOfOutput. Until the - // halo support is revisited, use the former for all tensors - // unless expanded and the latter otherwise. This assumes there's - // no expanded domains with halo, which is fine for now. - const auto has_expanded_domains = std::any_of( - tv->getMaybeAllocationDomain().begin(), - tv->getMaybeAllocationDomain().end(), - [](IterDomain* id) { return id->hasExpandedExtent(); }); - auto [sizes, strides] = has_expanded_domains - ? inferShapeOfOutput(tv, expr_eval) - : inferShapeOfIntermediate(tv, alloc, expr_eval); + // as well as halo. 
Currently, halo support has been removed so we only need
+  // to worry about the expand case which is handled in inferShapeOfOutput.
+  // There used to also be a inferShapeOfIntermediate function before this
+  // commit, but that was safely removed with halo support. This will need to
+  // be revisited when halo support is added again.
+  auto [sizes, strides] = inferShapeOfOutput(tv, expr_eval);
   info.shape_info.logical_sizes = sizes;
   info.shape_info.logical_strides = strides;
   auto dtype = tv->dtype() == DataType::Index ? index_type : tv->dtype();

From 9e5a9ca45b2cf064cdd7a94f05d2297e85c88977 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen 
Date: Tue, 4 Mar 2025 12:45:25 -0800
Subject: [PATCH 35/35] Fix for ExpandedBroadcastGlobalIntermediateTest

---
 csrc/runtime/executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp
index 2b8e43a5db2..801aa8a92c5 100644
--- a/csrc/runtime/executor.cpp
+++ b/csrc/runtime/executor.cpp
@@ -1155,7 +1155,7 @@ KernelArgumentHolder KernelExecutor::run(
       }
       if (has_expansion) {
         intermediate_buffer = at::native::expand(
-            intermediate_buffer, buf_info.shape_info.allocation_sizes);
+            intermediate_buffer, buf_info.shape_info.logical_sizes);
       }
       args.push(intermediate_buffer);
       intermediate_args.push(intermediate_buffer);