-
Notifications
You must be signed in to change notification settings - Fork 79
Use GetMetaData for stride computation
#649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
42 commits
Select commit
Hold shift + click to select a range
faed8fb
Initial kernel input support
zasdfgbnm 55f5c37
Merge branch 'main' into kernel_inputs
zasdfgbnm ea58b44
format
zasdfgbnm 9a6786a
Merge branch 'kernel_inputs' of github.com:NVIDIA/Fuser into kernel_i…
zasdfgbnm 06645a3
cleanup
zasdfgbnm 48b2538
cleanup
zasdfgbnm 2080387
save
zasdfgbnm e09b45c
minimum set of inputs
zasdfgbnm 1cfcbc3
Merge branch 'main' of github.com:NVIDIA/Fuser into kernel_inputs
zasdfgbnm 8b9e980
cleanup
zasdfgbnm 72937d7
save
zasdfgbnm 246603a
Merge branch 'main' of github.com:NVIDIA/Fuser into kernel_inputs_exe…
zasdfgbnm d0cb6d5
fix
zasdfgbnm 5d9220d
save
zasdfgbnm 6060fb9
save
zasdfgbnm 1d6301d
save
zasdfgbnm b0ea760
outputs and global buffers as kernel inputs
zasdfgbnm 7eedc74
real migration
zasdfgbnm 5848305
tidy
zasdfgbnm 14d50b7
renamings
zasdfgbnm 86cde8d
cleanups
zasdfgbnm 38d6179
commenting
zasdfgbnm 3886732
more cleanup
zasdfgbnm a6e3524
save
zasdfgbnm c6ec315
revert kernel change
zasdfgbnm 18ca119
Merge branch 'main' into kernel_inputs_executor
zasdfgbnm 4183996
Merge branch 'kernel_inputs_executor' into metadata-for-stride-inference
zasdfgbnm a694f1a
comment
zasdfgbnm 50a70e4
Merge branch 'kernel_inputs_executor' into metadata-for-stride-inference
zasdfgbnm 9adafd3
Merge branch 'main' of github.com:NVIDIA/Fuser into metadata-for-stri…
zasdfgbnm 30292f7
fix
zasdfgbnm 27b5ad8
save both logical and alloc size and stride
zasdfgbnm 0ddb297
move code
zasdfgbnm 2d22305
without precomputed values
zasdfgbnm ca618be
fix ExprSimplifierTest
zasdfgbnm bc847b1
fix AllocationDomainTest
zasdfgbnm c815b87
fix MetadataAsTensor
zasdfgbnm 7e5dafe
fix LoopRotationTest
zasdfgbnm 1eb8338
fix other tests
zasdfgbnm b00d99b
Merge branch 'main' into metadata-for-stride-inference
zasdfgbnm cfc9b72
unchange
zasdfgbnm 2a21b8f
tidy
zasdfgbnm File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,303 +14,6 @@ | |
|
|
||
| namespace nvfuser { | ||
|
|
||
| namespace { | ||
|
|
||
| // Forward traverse from rFactor domain to allocation domain, compute frontier | ||
| // sizes and strides, validate that splits are divisible and merges are | ||
| // contiguous, and update active_ids_ correspondingly. | ||
| class ForwardTraverseFromRFactorToAlloc { | ||
| ExpressionEvaluator& ee_; | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids_; | ||
|
|
||
| void handle(Split* split) { | ||
| auto in = split->in(); | ||
| auto inner = split->inner(); | ||
| auto outer = split->outer(); | ||
| auto in_it = active_ids_.find(in); | ||
| // TORCH_INTERNAL_ASSERT(in_it != active_ids_.end()) | ||
| if (in_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [in_size, in_stride] = in_it->second; | ||
| auto factor = ee_.evaluate(split->factor()).as<int64_t>(); | ||
| TORCH_INTERNAL_ASSERT( | ||
| in_size % factor == 0, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "non-divisible split is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(in) == 1); | ||
| TORCH_INTERNAL_ASSERT( | ||
| active_ids_ | ||
| .emplace(inner, std::pair<int64_t, int64_t>{factor, in_stride}) | ||
| .second); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| outer, | ||
| std::pair<int64_t, int64_t>{ | ||
| in_size / factor, in_stride * factor}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Merge* merge) { | ||
| auto inner = merge->inner(); | ||
| auto outer = merge->outer(); | ||
| auto out = merge->out(); | ||
| auto inner_it = active_ids_.find(inner); | ||
| auto outer_it = active_ids_.find(outer); | ||
| // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) | ||
| // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) | ||
| if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [inner_size, inner_stride] = inner_it->second; | ||
| auto [outer_size, outer_stride] = outer_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| inner_stride * inner_size == outer_stride, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "merging of discontiguous dimensions is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| out, | ||
| std::pair<int64_t, int64_t>{ | ||
| inner_size * outer_size, inner_stride}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Expr* expr) { | ||
| if (auto split = dynamic_cast<Split*>(expr)) { | ||
| handle(split); | ||
| } else if (auto merge = dynamic_cast<Merge*>(expr)) { | ||
| handle(merge); | ||
| } else { | ||
| TORCH_INTERNAL_ASSERT( | ||
| false, "Unsupported transformation in allocation domain"); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
| ForwardTraverseFromRFactorToAlloc( | ||
| ExpressionEvaluator& ee, | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids) | ||
| : ee_(ee), active_ids_(active_ids) {} | ||
|
|
||
| void run( | ||
| TensorView* tv, | ||
| const std::vector<IterDomain*>& rfactor, | ||
| const std::vector<IterDomain*>& alloc) { | ||
| auto forward_exprs = StmtSort::getExprsBetween( | ||
| tv->fusion(), | ||
| {rfactor.begin(), rfactor.end()}, | ||
| {alloc.begin(), alloc.end()}); | ||
| for (auto expr : forward_exprs) { | ||
| handle(expr); | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| // Similar to ForwardTraverseFromRFactorToAlloc, but in the opposite direction. | ||
| class BackwardTraverseFromRFactorToAlloc { | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved to |
||
| at::Tensor tensor_; | ||
| ExpressionEvaluator& ee_; | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids_; | ||
|
|
||
| void handle(Split* split) { | ||
| auto in = split->in(); | ||
| auto inner = split->inner(); | ||
| auto outer = split->outer(); | ||
| auto inner_it = active_ids_.find(inner); | ||
| auto outer_it = active_ids_.find(outer); | ||
| // TORCH_INTERNAL_ASSERT(inner_it != active_ids_.end()) | ||
| // TORCH_INTERNAL_ASSERT(outer_it != active_ids_.end()) | ||
| if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [inner_size, inner_stride] = inner_it->second; | ||
| auto [outer_size, outer_stride] = outer_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| inner_stride * inner_size == outer_stride, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(inner) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(outer) == 1); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| in, | ||
| std::pair<int64_t, int64_t>{ | ||
| inner_size * outer_size, inner_stride}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Merge* merge) { | ||
| auto inner = merge->inner(); | ||
| auto outer = merge->outer(); | ||
| auto out = merge->out(); | ||
| auto factor = ee_.evaluate(inner->extent()).as<int64_t>(); | ||
| auto out_it = active_ids_.find(out); | ||
| // TORCH_INTERNAL_ASSERT(out_it != active_ids_.end()) | ||
| if (out_it == active_ids_.end()) { | ||
| // TODO: see [Allocation domain on both side of rFactor] | ||
| return; | ||
| } | ||
| auto [out_size, out_stride] = out_it->second; | ||
| TORCH_INTERNAL_ASSERT( | ||
| out_size % factor == 0, | ||
| "The rFactor domain and allocation domain of fusion input/output ", | ||
| "tensors must be a one-to-one map, therefore, ", | ||
| "the size of the output must be divisible by the size of the inner dimension"); | ||
| TORCH_INTERNAL_ASSERT(active_ids_.erase(out) == 1); | ||
| TORCH_INTERNAL_ASSERT( | ||
| active_ids_ | ||
| .emplace(inner, std::pair<int64_t, int64_t>{factor, out_stride}) | ||
| .second); | ||
| TORCH_INTERNAL_ASSERT(active_ids_ | ||
| .emplace( | ||
| outer, | ||
| std::pair<int64_t, int64_t>{ | ||
| out_size / factor, out_stride * factor}) | ||
| .second); | ||
| } | ||
|
|
||
| void handle(Expr* expr) { | ||
| if (auto split = dynamic_cast<Split*>(expr)) { | ||
| handle(split); | ||
| } else if (auto merge = dynamic_cast<Merge*>(expr)) { | ||
| handle(merge); | ||
| } else { | ||
| TORCH_INTERNAL_ASSERT( | ||
| false, "Unsupported transformation in allocation domain"); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
| BackwardTraverseFromRFactorToAlloc( | ||
| ExpressionEvaluator& ee, | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>>& active_ids) | ||
| : ee_(ee), active_ids_(active_ids) {} | ||
|
|
||
| void run( | ||
| TensorView* tv, | ||
| const std::vector<IterDomain*>& rfactor, | ||
| const std::vector<IterDomain*>& alloc) { | ||
| auto backward_exprs = StmtSort::getExprsBetween( | ||
| tv->fusion(), | ||
| {alloc.begin(), alloc.end()}, | ||
| {rfactor.begin(), rfactor.end()}); | ||
| std::reverse(backward_exprs.begin(), backward_exprs.end()); | ||
| for (auto expr : backward_exprs) { | ||
| handle(expr); | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| } // namespace | ||
|
|
||
| // Given an ATen tensor, whose sizes and strides are w.r.t to the rFactor domain | ||
| // of its corresponding TensorView, compute the sizes and strides of the tensor | ||
| // with respect to its allocation domain. | ||
| // For example, if the rFactor domain is [I1, I2], and the allocation domain is | ||
| // [I2*I1], and the tensor's size is [5, 3] and stride is [2, 10], then the | ||
| // resulting size will be [15] and stride will be [2] | ||
| // Another example, if the rFactor domain is [I1*I2] and the allocation domain | ||
| // is [I1, I2], and the tensor's size is [15] and stride is [7], and the extent | ||
| // of I2 is 5, then the resulting size will be [3, 5] and stride will be [35, 7] | ||
| std::vector<std::pair<int64_t, int64_t>> | ||
| inferAndValidateAllocationSizesAndStrides( | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved to |
||
| const at::Tensor& tensor, | ||
| TensorView* tv, | ||
| ExpressionEvaluator& ee) { | ||
| if (tv == nullptr || !tv->hasAllocation()) { | ||
| // When tv is nullptr, or tv does not have allocation, the given sizes and | ||
| // strides should already be in the target format. So nothing to do here. | ||
| std::vector<std::pair<int64_t, int64_t>> result; | ||
| for (auto i : c10::irange(tensor.dim())) { | ||
| result.emplace_back(tensor.size(i), tensor.stride(i)); | ||
| } | ||
| return result; | ||
| } | ||
| const auto& alloc = | ||
| TensorDomain::noReductions(tv->getMaybeAllocationDomain()); | ||
| const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); | ||
|
|
||
| // active IDs and their shape and stride | ||
| std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids; | ||
| TORCH_INTERNAL_ASSERT((int64_t)rfactor.size() == tensor.dim()); | ||
| for (int64_t i : c10::irange((int64_t)rfactor.size())) { | ||
| auto rf_id = rfactor.at(i); | ||
| active_ids[rf_id] = {tensor.size(i), tensor.stride(i)}; | ||
| } | ||
|
|
||
| ForwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); | ||
| BackwardTraverseFromRFactorToAlloc(ee, active_ids).run(tv, rfactor, alloc); | ||
|
|
||
| // Now active_ids should contain the final sizes and strides, unordered. We | ||
| // need to put them to the correct order. | ||
| std::vector<std::pair<int64_t, int64_t>> sizes_strides; | ||
| sizes_strides.reserve(alloc.size()); | ||
| for (auto i : c10::irange(alloc.size())) { | ||
| auto id = alloc.at(i); | ||
| sizes_strides.emplace_back(active_ids.at(id)); | ||
| } | ||
| // Validate final sizes and strides with contiguity | ||
| int64_t contiguous_stride = 1; | ||
| std::vector<std::optional<bool>> contiguity = tv->getContiguity(); | ||
| for (int64_t i = (int64_t)sizes_strides.size() - 1; i >= 0; i--) { | ||
| if (alloc.at(i)->isBroadcast()) { | ||
| continue; | ||
| } | ||
| while (!contiguity.back().has_value()) { | ||
| contiguity.pop_back(); | ||
| } | ||
| auto [size, stride] = sizes_strides.at(i); | ||
| TORCH_INTERNAL_ASSERT(!contiguity.empty()); | ||
| auto last_contiguity = contiguity.back(); | ||
| TORCH_INTERNAL_ASSERT( | ||
| last_contiguity.has_value(), | ||
| "I don't think this check makes sense, but unfortunately ", | ||
| "clang-tidy is not smart enough to infer from the context that this is always true."); | ||
| if (*last_contiguity) { | ||
| TORCH_CHECK( | ||
| stride == contiguous_stride, | ||
| "Stride mismatch with contiguity info. ", | ||
| "tv: ", | ||
| tv->toString(), | ||
| " allocation domain: ", | ||
| ir_utils::toString(tv->getMaybeAllocationDomain()), | ||
| " dim: ", | ||
| i, | ||
| " expected stride: ", | ||
| contiguous_stride, | ||
| " actual stride: ", | ||
| stride); | ||
| } | ||
| contiguous_stride = stride * size; | ||
| contiguity.pop_back(); | ||
| } | ||
| TORCH_INTERNAL_ASSERT( | ||
| contiguity.empty(), | ||
| "The size of contiguity mismatch with the dimensionality of allocation domain"); | ||
| // Validate that for expanded broadcast, the stride must be zero. | ||
| for (int64_t i : c10::irange((int64_t)sizes_strides.size())) { | ||
| if (auto alloc_id = alloc.at(i); alloc_id->hasExpandedExtent()) { | ||
| auto [_, stride] = sizes_strides.at(i); | ||
| TORCH_CHECK( | ||
| stride == 0, | ||
| "Expecting an expanded dimension on dimension ", | ||
| i, | ||
| " but found stride ", | ||
| stride); | ||
| } | ||
| } | ||
| return sizes_strides; | ||
| } | ||
|
|
||
| PrimDataType TensorArgAbstract::getSmallestIndexType() const { | ||
| KernelIndexTypeCompute index_type_helper; | ||
| for (const auto dim_i : c10::irange(tensor_.ndimension())) { | ||
|
|
@@ -637,8 +340,8 @@ std::vector<std::byte> getTensorArgBuffer( | |
| auto struct_ = metadata.as<Struct>(); | ||
| std::vector<std::byte> buffer; | ||
| void* ptr = (void*)struct_["data"]; | ||
| std::vector<int64_t> sizes = (std::vector<int64_t>)struct_["size"]; | ||
| std::vector<int64_t> strides = (std::vector<int64_t>)struct_["stride"]; | ||
| std::vector<int64_t> sizes = (std::vector<int64_t>)struct_["logical_size"]; | ||
| std::vector<int64_t> strides = (std::vector<int64_t>)struct_["alloc_stride"]; | ||
| if (index_type == PrimDataType::Int) { | ||
| buffer.reserve( | ||
| sizeof(ptr) + sizeof(int64_t) * (sizes.size() + strides.size())); | ||
|
|
@@ -686,10 +389,8 @@ std::vector<std::byte> getKernelArgument( | |
| (std::byte*)tensor.data_ptr(), | ||
| (std::byte*)tensor.data_ptr() + tensor.element_size()); | ||
| } else { | ||
| auto resolved_arg = getTensorArg(tensor, tv, ee, index_type); | ||
| return std::vector<std::byte>( | ||
| (std::byte*)resolved_arg->arg(), | ||
| (std::byte*)resolved_arg->arg() + resolved_arg->argSize()); | ||
| auto metadata = ee.evaluate(IrBuilder::metadataExpr(tv)); | ||
| return getTensorArgBuffer(metadata, index_type); | ||
| } | ||
| } else if (isIntegralType(parameter->dtype())) { | ||
| int64_t v = pv.as<int64_t>(); | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved to
tensor_metadata.cpp unchanged.