From 5be2421d3afb5e3987073d64507c04d5d0195b9b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 22 May 2023 16:13:33 -0700 Subject: [PATCH 01/12] Allocation domain support in cacheFork --- csrc/tensor_view.cpp | 3 +- test/test_allocation_domain.cpp | 164 ++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 1 deletion(-) diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp index 2c0992aa5ca..bfeaf9414c8 100644 --- a/csrc/tensor_view.cpp +++ b/csrc/tensor_view.cpp @@ -1281,7 +1281,8 @@ TensorView* TensorView::cacheFork() { fusion()->replaceOutput(this, new_output); // Transform new output according to this TV - auto replayed_output_pair = TransformReplay::replayCasP(new_output, this, -1); + auto replayed_output_pair = TransformReplay::replayCasP( + new_output, this, -1, TransformReplayOptions().replayAllocation()); new_output->setDomain(replayed_output_pair.first); return new_output; diff --git a/test/test_allocation_domain.cpp b/test/test_allocation_domain.cpp index 838807f9540..355fc761506 100644 --- a/test/test_allocation_domain.cpp +++ b/test/test_allocation_domain.cpp @@ -888,4 +888,168 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter_CUDA) { testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } +// Similar to NHWC4d_To_NHWC4d_CUDA, but does a cacheFork +TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork_CUDA) { + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(4); + fusion.addInput(tv0); + auto tv1 = set(tv0); + fusion.addOutput(tv1); + auto tv2 = set(tv1); + fusion.addOutput(tv2); + + std::vector tv0_nhwc = { + tv0->axis(0), tv0->axis(2), tv0->axis(3), tv0->axis(1)}; + tv0->setAllocationDomain(tv0_nhwc, true); + + std::vector tv1_nhwc = { + tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)}; + tv1->setAllocationDomain(tv1_nhwc, true); + + std::vector tv2_nhwc = { + tv2->axis(0), tv2->axis(2), tv2->axis(3), tv2->axis(1)}; + tv2->setAllocationDomain(tv2_nhwc, true); + + auto tv3 = tv1->cacheFork(); + + std::vector expected_new_allocation_domain{ + tv3->axis(0), tv3->axis(2), tv3->axis(3), tv3->axis(1)}; + + ASSERT_EQ(tv0->getAllocationDomain(), tv0_nhwc); + ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc); + ASSERT_EQ(tv2->getAllocationDomain(), tv2_nhwc); + ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain); + + for (auto tv : {tv1, tv2, tv3}) { + // [N, C, H, W] + tv->reorder({{1, -1}}); + // [N, H, W, C] + tv->merge(0); + tv->merge(0); + tv->merge(0); + // [N*H*W*C] + tv->split(0, 4); + tv->axis(1)->parallelize(ParallelType::Vectorize); + tv->split(0, 128); + tv->axis(1)->parallelize(ParallelType::TIDx); + tv->axis(0)->parallelize(ParallelType::BIDx); + // [BIDx, TIDx, V] + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + int n = 31, h = 64, w = 103, c = 21; + + at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options); + at::Tensor t0 = + t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); + + FusionExecutor fe; + fe.compileFusion(fusion_ptr.get(), {t0}); + + EXPECT_THAT( + [&]() { fe.runFusion({t0_wrong_format}); }, + ::testing::ThrowsMessage( + ::testing::HasSubstr("Stride mismatch with contiguity info"))); + + auto cg_outputs = fe.runFusion({t0}); + + ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); + + testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__); +} + +// Similar to NHWC2d_To_NHWC2d_CUDA, but does a cacheFork +TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork_CUDA) { + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + int n = 31, h = 64, w = 103, c = 21; + + auto tv0 = makeContigConcreteTensor({n * h / 8, 8 * w * c}); + fusion.addInput(tv0); + + std::vector tv0_2d = {tv0->axis(0), tv0->axis(1)}; + tv0->setAllocationDomain(tv0_2d, true); + tv0->merge(0); + tv0->split(0, c); + tv0->split(0, w); + tv0->split(0, h); + // [N, H, W, C] + tv0->reorder({{-1, 1}}); + // [N, C, H, W] + tv0->commitLeafToRFactor(); + + auto tv1 = set(tv0); + fusion.addOutput(tv1); + + auto tv2 = set(tv1); + fusion.addOutput(tv2); + + std::vector tv1_nhwc = { + tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)}; + tv1->setAllocationDomain(tv1_nhwc, true); + + for (auto tv : {tv1, tv2}) { + // [N, C, H, W] + tv->reorder({{1, -1}}); + // [N, H, W, C] + tv->merge(0); + tv->merge(1); + tv->merge(0); + // [N*H*W*C] + + tv->split(0, 4); + // [N*H*W*C/4, 4] + } + + std::vector tv2_2d = {tv2->axis(0), tv2->axis(1)}; + tv2->setAllocationDomain(tv2_2d, true); + + auto tv3 = tv1->cacheFork(); + + std::vector expected_new_allocation_domain{ + tv3->getMaybeRFactorDomain().at(0), + tv3->getMaybeRFactorDomain().at(2), + tv3->getMaybeRFactorDomain().at(3), + tv3->getMaybeRFactorDomain().at(1)}; + + ASSERT_EQ(tv0->getAllocationDomain(), tv0_2d); + ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc); + ASSERT_EQ(tv2->getAllocationDomain(), tv2_2d); + ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain); + + for (auto tv : {tv1, tv2, tv3}) { + tv->split(0, 128); + tv->axis(0)->parallelize(ParallelType::BIDx); + tv->axis(1)->parallelize(ParallelType::TIDx); + tv->axis(2)->parallelize(ParallelType::Vectorize); + // [BIDx, TIDx, V] + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options); + at::Tensor t0 = + t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); + + FusionExecutor fe; + fe.compileFusion(fusion_ptr.get(), {t0}); + + EXPECT_THAT( + [&]() { fe.runFusion({t0_wrong_format}); }, + ::testing::ThrowsMessage(::testing::HasSubstr( + "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); + + auto cg_outputs = fe.runFusion({t0}); + + ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); + + testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__); +} + } // namespace nvfuser From 2010de6822f28d8c4677291053b483bec2dd9098 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 09:43:44 -0700 Subject: [PATCH 02/12] TensorArgAbstract allocation size --- csrc/executor_kernel_arg.h | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index 9c22f6c6785..b8977a82926 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -199,9 +199,19 @@ struct TensorArgAbstract : ArgAbstract { return tensor_.size(i); } - virtual int64_t getStride(int64_t i) const { + virtual int64_t getAllocRank() const { TORCH_INTERNAL_ASSERT( - false, "The stride of an abstract tensor arg is not known."); + false, "The allocation rank of an abstract tensor arg is not known."); + } + + virtual int64_t getAllocSize(int64_t i) const { + TORCH_INTERNAL_ASSERT( + false, "The allocation shape of an abstract tensor arg is not known."); + } + + virtual int64_t getAllocStride(int64_t i) const { + TORCH_INTERNAL_ASSERT( + false, "The allocation stride of an abstract tensor arg is not known."); } size_t getPointerAddress() const { @@ -274,6 +284,7 @@ inferAndValidateAllocationSizesAndStrides( template struct TensorArg : public TensorArgAbstract { TENSOR_TYPE instance_; + std::array alloc_sizes; TensorArg(const at::Tensor& tensor, TensorView* tv, ExpressionEvaluator& eval) : TensorArgAbstract(tensor) { @@ -293,12 +304,21 @@ struct TensorArg : public TensorArgAbstract { TORCH_INTERNAL_ASSERT( (size_t)instance_.nAllocationDims() == sizes_strides.size()); for (auto i : c10::irange((int64_t)sizes_strides.size())) { + alloc_sizes.at(i) = sizes_strides.at(i).first; using stride_t = typename TENSOR_TYPE::index_type; instance_.setStride(i, (stride_t)sizes_strides.at(i).second); } } - int64_t getStride(int64_t i) const override { + int64_t getAllocRank() const override { + return instance_.nAllocationDims(); + } + + int64_t getAllocSize(int64_t i) const override { + return alloc_sizes.at(i); + } + + int64_t getAllocStride(int64_t i) const override { return instance_.getStride(i); } @@ -323,10 +343,15 @@ struct TensorArg : public TensorArgAbstract { std::string toString() const override { std::stringstream ss; ss << TensorArgAbstract::toString(); - ss << " stride: ("; - for (auto i = 0; i < getRank(); i++) { - ss << getStride(i) << ", "; + ss << " allocation size: ("; + for (auto i = 0; i < getAllocRank(); i++) { + ss << getAllocSize(i) << ", "; + } + ss << ") allocation stride: ("; + for (auto i = 0; i < getAllocRank(); i++) { + ss << getAllocStride(i) << ", "; } + ss << ")"; return ss.str(); } From f498f3f66cf9fd93c8b813dc9291b69d7ea62612 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 09:45:54 -0700 Subject: [PATCH 03/12] registry --- csrc/scheduler/registry.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 9b84d87bf9d..06266584b99 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -970,14 +970,14 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( // find and push discontiguous stride auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType()); input_discontig_strides_[fusion_inp] = {}; - auto dims = tensor_arg_abstract->getRank(); + auto dims = tensor_arg_abstract->getAllocRank(); int64_t expected_stride = 1; for (auto dim = dims - 1; dim >= 0; dim--) { - auto size = tensor_arg_abstract->getSize((int)dim); + auto size = tensor_arg_abstract->getAllocSize((int)dim); if (size <= 1) { continue; } - auto stride = tensor_arg_abstract->getStride((int)dim); + auto stride = tensor_arg_abstract->getAllocStride((int)dim); if (stride != expected_stride) { input_discontig_strides_[fusion_inp].push_back(stride * dtype_size); expected_stride = stride; From 5dd984488960f12239da39c9815891d2889f5372 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 09:46:12 -0700 Subject: [PATCH 04/12] Revert "Allocation domain support in cacheFork" This reverts commit 5be2421d3afb5e3987073d64507c04d5d0195b9b. --- csrc/tensor_view.cpp | 3 +- test/test_allocation_domain.cpp | 164 -------------------------------- 2 files changed, 1 insertion(+), 166 deletions(-) diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp index bfeaf9414c8..2c0992aa5ca 100644 --- a/csrc/tensor_view.cpp +++ b/csrc/tensor_view.cpp @@ -1281,8 +1281,7 @@ TensorView* TensorView::cacheFork() { fusion()->replaceOutput(this, new_output); // Transform new output according to this TV - auto replayed_output_pair = TransformReplay::replayCasP( - new_output, this, -1, TransformReplayOptions().replayAllocation()); + auto replayed_output_pair = TransformReplay::replayCasP(new_output, this, -1); new_output->setDomain(replayed_output_pair.first); return new_output; diff --git a/test/test_allocation_domain.cpp b/test/test_allocation_domain.cpp index 355fc761506..838807f9540 100644 --- a/test/test_allocation_domain.cpp +++ b/test/test_allocation_domain.cpp @@ -888,168 +888,4 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter_CUDA) { testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } -// Similar to NHWC4d_To_NHWC4d_CUDA, but does a cacheFork -TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork_CUDA) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); - auto tv1 = set(tv0); - fusion.addOutput(tv1); - auto tv2 = set(tv1); - fusion.addOutput(tv2); - - std::vector tv0_nhwc = { - tv0->axis(0), tv0->axis(2), tv0->axis(3), tv0->axis(1)}; - tv0->setAllocationDomain(tv0_nhwc, true); - - std::vector tv1_nhwc = { - tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)}; - tv1->setAllocationDomain(tv1_nhwc, true); - - std::vector tv2_nhwc = { - tv2->axis(0), tv2->axis(2), tv2->axis(3), tv2->axis(1)}; - tv2->setAllocationDomain(tv2_nhwc, true); - - auto tv3 = tv1->cacheFork(); - - std::vector expected_new_allocation_domain{ - tv3->axis(0), tv3->axis(2), tv3->axis(3), tv3->axis(1)}; - - ASSERT_EQ(tv0->getAllocationDomain(), tv0_nhwc); - ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc); - ASSERT_EQ(tv2->getAllocationDomain(), tv2_nhwc); - ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain); - - for (auto tv : {tv1, tv2, tv3}) { - // [N, C, H, W] - tv->reorder({{1, -1}}); - // [N, H, W, C] - tv->merge(0); - tv->merge(0); - tv->merge(0); - // [N*H*W*C] - tv->split(0, 4); - tv->axis(1)->parallelize(ParallelType::Vectorize); - tv->split(0, 128); - tv->axis(1)->parallelize(ParallelType::TIDx); - tv->axis(0)->parallelize(ParallelType::BIDx); - // [BIDx, TIDx, V] - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - int n = 31, h = 64, w = 103, c = 21; - - at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options); - at::Tensor t0 = - t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - - EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, - ::testing::ThrowsMessage( - ::testing::HasSubstr("Stride mismatch with contiguity info"))); - - auto cg_outputs = fe.runFusion({t0}); - - ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); - - testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__); -} - -// Similar to NHWC2d_To_NHWC2d_CUDA, but does a cacheFork -TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork_CUDA) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); - FusionGuard fg(&fusion); - - int n = 31, h = 64, w = 103, c = 21; - - auto tv0 = makeContigConcreteTensor({n * h / 8, 8 * w * c}); - fusion.addInput(tv0); - - std::vector tv0_2d = {tv0->axis(0), tv0->axis(1)}; - tv0->setAllocationDomain(tv0_2d, true); - tv0->merge(0); - tv0->split(0, c); - tv0->split(0, w); - tv0->split(0, h); - // [N, H, W, C] - tv0->reorder({{-1, 1}}); - // [N, C, H, W] - tv0->commitLeafToRFactor(); - - auto tv1 = set(tv0); - fusion.addOutput(tv1); - - auto tv2 = set(tv1); - fusion.addOutput(tv2); - - std::vector tv1_nhwc = { - tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)}; - tv1->setAllocationDomain(tv1_nhwc, true); - - for (auto tv : {tv1, tv2}) { - // [N, C, H, W] - tv->reorder({{1, -1}}); - // [N, H, W, C] - tv->merge(0); - tv->merge(1); - tv->merge(0); - // [N*H*W*C] - - tv->split(0, 4); - // [N*H*W*C/4, 4] - } - - std::vector tv2_2d = {tv2->axis(0), tv2->axis(1)}; - tv2->setAllocationDomain(tv2_2d, true); - - auto tv3 = tv1->cacheFork(); - - std::vector expected_new_allocation_domain{ - tv3->getMaybeRFactorDomain().at(0), - tv3->getMaybeRFactorDomain().at(2), - tv3->getMaybeRFactorDomain().at(3), - tv3->getMaybeRFactorDomain().at(1)}; - - ASSERT_EQ(tv0->getAllocationDomain(), tv0_2d); - ASSERT_EQ(tv1->getAllocationDomain(), tv1_nhwc); - ASSERT_EQ(tv2->getAllocationDomain(), tv2_2d); - ASSERT_EQ(tv3->getAllocationDomain(), expected_new_allocation_domain); - - for (auto tv : {tv1, tv2, tv3}) { - tv->split(0, 128); - tv->axis(0)->parallelize(ParallelType::BIDx); - tv->axis(1)->parallelize(ParallelType::TIDx); - tv->axis(2)->parallelize(ParallelType::Vectorize); - // [BIDx, TIDx, V] - } - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0_wrong_format = at::randn({n, c, h, w}, options); - at::Tensor t0 = - t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - - EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, - ::testing::ThrowsMessage(::testing::HasSubstr( - "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - - auto cg_outputs = fe.runFusion({t0}); - - ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); - - testValidate(&fusion, cg_outputs, {t0}, {t0, t0}, __LINE__, __FILE__); -} - } // namespace nvfuser From 3340235ee651c0ed8870f01632b72ae16627c58a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 10:16:29 -0700 Subject: [PATCH 05/12] cleanup vectorize --- csrc/scheduler/registry.cpp | 96 ----------------------------- csrc/scheduler/registry.h | 6 -- csrc/scheduler/transpose.cpp | 4 +- csrc/scheduler/vectorize_helper.cpp | 81 +++++++----------------- csrc/scheduler/vectorize_helper.h | 11 ---- 5 files changed, 24 insertions(+), 174 deletions(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 06266584b99..a35a171b6c6 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -1149,102 +1149,6 @@ size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) { return vector_size; } -// Gets the vectorizable width of the inner most dimension of tv if it's -// contiguous. Ignores inner most dimensions that are broadcast or reduction. -size_t SchedulerRuntimeInfo::getInnerDimVectorizableWidth(TensorView* tv) { - auto inner_vectorword_map_it_ = inner_vectorword_map_.find(tv); - if (inner_vectorword_map_it_ != inner_vectorword_map_.end()) { - return inner_vectorword_map_it_->second; - } - - // If we don't have an record, either it is a tv with innermost broadcast, - // or it is an intermediate tensor allocated by fuser. Logic copied to get - // root according to scheduler_utils::innerMostRootDim. - auto tv_root = tv->hasReduction() && tv->hasRFactor() - ? tv->getRootDomain() - : tv->getMaybeRFactorDomain(); - - auto tv_root_no_reductions = TensorDomain::noReductions(tv_root); - - auto contiguity = tv->domain()->contiguity(); - // Appears after reductions the reduction domain often has a contiguity entry. - // This only matters if the result of the reduction is an output - if (contiguity.size() == tv_root.size() && - contiguity.size() != tv_root_no_reductions.size()) { - std::vector> new_contiguity; - for (auto i : c10::irange(tv_root.size())) { - if (!tv_root[i]->isReduction()) { - new_contiguity.push_back(contiguity[i]); - } - } - contiguity = new_contiguity; - } - tv_root = tv_root_no_reductions; - - auto tv_root_no_reductions_size = tv_root_no_reductions.size(); - - // Filter out 0-dim tensors - if (tv_root_no_reductions_size < 1) { - return 1; - } - - // Filter out mismatched contiguity info - if (tv_root_no_reductions_size != contiguity.size()) { - return 1; - } - - auto inner_most_dim = scheduler_utils::innerMostRootDim(tv); - - int id_pos = -1; - for (auto root_i : c10::irange((int)tv_root_no_reductions_size)) { - if (tv_root_no_reductions[root_i] == inner_most_dim) { - id_pos = root_i; - break; - } - } - - // Something went wrong with finding the inner most dimension, just - // return 1. - if (id_pos == -1) { - return 1; - } - - // If the inner most dimension is not contiguous return 1 - auto contiguity_opt = contiguity.at(id_pos); - TORCH_INTERNAL_ASSERT(contiguity_opt.has_value()); - if (!*contiguity_opt) { - return 1; - } - - size_t item_size = dataTypeSize(tv->dtype(), getIndexType()); - - // Alignment should always at least be the data type size - TORCH_INTERNAL_ASSERT(getAlignmentSize(tv) % item_size == 0); - size_t max_vector_size = getAlignmentSize(tv) / item_size; - - // Assuming intermediate tensors have friendly alignment, and - // all contiguity true. Determine the largest power of 2 below - // innermost dimension size for the word size of vectorizaiton - size_t vector_size = 1; - size_t next_vector_size = 2; - auto maybe_inner_dimension_size = - expression_evaluator_->evaluate(inner_most_dim->extent()); - TORCH_INTERNAL_ASSERT(maybe_inner_dimension_size.has_value()); - size_t inner_dimension_size = maybe_inner_dimension_size->as(); - - while (next_vector_size <= max_vector_size && - next_vector_size <= inner_dimension_size && - inner_dimension_size % next_vector_size == 0) { - vector_size = next_vector_size; - next_vector_size *= 2; - } - - // save output to avoid re-compute - inner_vectorword_map_[tv] = vector_size; - - return vector_size; -} - bool SchedulerEntry::sameAs(const SchedulerEntry* other) { return heuristic_ == other->heuristic_ && params_->sameAs(other->params_); } diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h index 81ce5f4b997..80332f0cd65 100644 --- a/csrc/scheduler/registry.h +++ b/csrc/scheduler/registry.h @@ -71,10 +71,6 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable { // contiguity. Ignores dimensions that are broadcast or reduction. size_t getMaxVectorizableWidth(TensorView* tv); - // Gets the vectorizable width of the inner most dimension of tv if it's - // contiguous. Ignores inner most dimensions that are broadcast or reduction. - size_t getInnerDimVectorizableWidth(TensorView* tv); - // Computes alignment size in bytes for provided ptr address static size_t computeAlignmentSize(size_t ptr_address); @@ -129,8 +125,6 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable { std::unordered_map alignment_map_; // Cache for getMaxVectorizableWidth std::unordered_map max_vectorword_map_; - // Cache for getInnerDimVectorizableWidth - std::unordered_map inner_vectorword_map_; // Found index mode kernel needs to be run in PrimDataType index_type_ = PrimDataType::Int; diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp index 69175bbe002..d0e2f445bfe 100644 --- a/csrc/scheduler/transpose.cpp +++ b/csrc/scheduler/transpose.cpp @@ -700,7 +700,7 @@ std::shared_ptr getTransposeHeuristics( for (auto tv : grouped_inputs_outputs[0]) { const auto tv_vectorize_factor = - runtime_info.getInnerDimVectorizableWidth(tv); + runtime_info.getMaxVectorizableWidth(tv); vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor); } // TODO: Since group2 only has global->shared and shared->global set op, we @@ -709,7 +709,7 @@ std::shared_ptr getTransposeHeuristics( // group 2 for (auto tv : grouped_inputs_outputs[1]) { const auto tv_vectorize_factor = - runtime_info.getInnerDimVectorizableWidth(tv); + runtime_info.getMaxVectorizableWidth(tv); vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor); } diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp index ca2fcf27b86..d1ada990770 100644 --- a/csrc/scheduler/vectorize_helper.cpp +++ b/csrc/scheduler/vectorize_helper.cpp @@ -1201,13 +1201,29 @@ int64_t getVectorizationSize( return vectorize_size; } -size_t getExpandedVectorization( - const std::vector& reference_maps, +size_t getVectorizationFactor( SchedulerRuntimeInfo& runtime_info, - const std::vector vectorizable_inputs_outputs, TensorView* reference_tv, - int break_point, - size_t default_word_size) { + HeuristicSummary* data_cache, + int break_point) { + auto vectorizable_inputs_outputs_entry = + HeuristicSummaryEntry( + data_cache, [&reference_tv]() { + return std::make_unique>( + scheduler_utils::getInputsOutputsWithInnerDim( + reference_tv, true, true)); + }); + + auto& vectorizable_inputs_outputs = vectorizable_inputs_outputs_entry.get(); + + auto vectorize_maps_entry = + HeuristicSummaryEntry( + data_cache, [&reference_tv]() { + return std::make_unique< + std::vector>( + vectorize_helper::getAllVectorizedMapsOf(reference_tv)); + }); + if (vectorizable_inputs_outputs.empty()) { return 1; } @@ -1229,14 +1245,7 @@ size_t getExpandedVectorization( common_alignment_size, runtime_info.getAlignmentSize(inp_or_out)); } - // If there's no possibility to increase vector size of provided tensors, - // then don't bother doing a more complex analysis to try and do so, just - // return early. - if (max_expand_size == default_word_size) { - return default_word_size; - } - - auto reference_map = reference_maps[break_point]; + auto reference_map = vectorize_maps_entry.get().at(break_point); // Initialize to max the tensors could support. size_t max_supported_vector_size = max_expand_size; for (auto inp_or_out : vectorizable_inputs_outputs) { @@ -1257,51 +1266,5 @@ size_t getExpandedVectorization( return max_supported_vector_size; } -size_t getVectorizationFactor( - SchedulerRuntimeInfo& runtime_info, - TensorView* reference_tv, - HeuristicSummary* data_cache, - int break_point) { - auto vectorizable_inputs_outputs_entry = - HeuristicSummaryEntry( - data_cache, [&reference_tv]() { - return std::make_unique>( - scheduler_utils::getInputsOutputsWithInnerDim( - reference_tv, true, true)); - }); - - auto& vectorizable_inputs_outputs = vectorizable_inputs_outputs_entry.get(); - - size_t vectorize_factor = std::numeric_limits::max(); - - for (auto tv : vectorizable_inputs_outputs) { - const auto tv_vectorize_factor = - runtime_info.getInnerDimVectorizableWidth(tv); - vectorize_factor = std::min(vectorize_factor, tv_vectorize_factor); - } - - if (vectorize_factor == std::numeric_limits::max()) { - vectorize_factor = 1; - } - - auto vectorize_maps_entry = - HeuristicSummaryEntry( - data_cache, [&reference_tv]() { - return std::make_unique< - std::vector>( - vectorize_helper::getAllVectorizedMapsOf(reference_tv)); - }); - - vectorize_factor = vectorize_helper::getExpandedVectorization( - vectorize_maps_entry.get(), - runtime_info, - vectorizable_inputs_outputs, - reference_tv, - break_point, - vectorize_factor); - - return vectorize_factor; -} - } // namespace vectorize_helper } // namespace nvfuser diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h index 0c8b6536195..2204782fe85 100644 --- a/csrc/scheduler/vectorize_helper.h +++ b/csrc/scheduler/vectorize_helper.h @@ -594,17 +594,6 @@ std::vector> getContigVectorSizesOf( TensorView* of_tv, ContiguousInnerDimensionsMapper& mapper); -// TODO: vectorizable_inputs_outputs should actually be known based on the -// computed mappings. If nothing is mapped for a tensorview it's not -// vectorizable. -size_t getExpandedVectorization( - const std::vector& reference_maps, - SchedulerRuntimeInfo& runtime_info, - const std::vector vectorizable_inputs_outputs, - TensorView* reference_tv, - int break_point, - size_t default_word_size); - size_t getVectorizationFactor( SchedulerRuntimeInfo& runtime_info, TensorView* reference_tv, From dd83215414be1c5ff0d212d3383e8f6f502cfae6 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 10:17:53 -0700 Subject: [PATCH 06/12] Revert "registry" This reverts commit f498f3f66cf9fd93c8b813dc9291b69d7ea62612. --- csrc/scheduler/registry.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index a35a171b6c6..e9145258230 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -970,14 +970,14 @@ SchedulerRuntimeInfo::SchedulerRuntimeInfo( // find and push discontiguous stride auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType()); input_discontig_strides_[fusion_inp] = {}; - auto dims = tensor_arg_abstract->getAllocRank(); + auto dims = tensor_arg_abstract->getRank(); int64_t expected_stride = 1; for (auto dim = dims - 1; dim >= 0; dim--) { - auto size = tensor_arg_abstract->getAllocSize((int)dim); + auto size = tensor_arg_abstract->getSize((int)dim); if (size <= 1) { continue; } - auto stride = tensor_arg_abstract->getAllocStride((int)dim); + auto stride = tensor_arg_abstract->getStride((int)dim); if (stride != expected_stride) { input_discontig_strides_[fusion_inp].push_back(stride * dtype_size); expected_stride = stride; From 91bc0c1aa55ad3515868bab6ecef5207eac06172 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 10:19:19 -0700 Subject: [PATCH 07/12] Revert "TensorArgAbstract allocation size" This reverts commit 2010de6822f28d8c4677291053b483bec2dd9098. --- csrc/executor_kernel_arg.h | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/csrc/executor_kernel_arg.h b/csrc/executor_kernel_arg.h index b8977a82926..9c22f6c6785 100644 --- a/csrc/executor_kernel_arg.h +++ b/csrc/executor_kernel_arg.h @@ -199,19 +199,9 @@ struct TensorArgAbstract : ArgAbstract { return tensor_.size(i); } - virtual int64_t getAllocRank() const { + virtual int64_t getStride(int64_t i) const { TORCH_INTERNAL_ASSERT( - false, "The allocation rank of an abstract tensor arg is not known."); - } - - virtual int64_t getAllocSize(int64_t i) const { - TORCH_INTERNAL_ASSERT( - false, "The allocation shape of an abstract tensor arg is not known."); - } - - virtual int64_t getAllocStride(int64_t i) const { - TORCH_INTERNAL_ASSERT( - false, "The allocation stride of an abstract tensor arg is not known."); + false, "The stride of an abstract tensor arg is not known."); } size_t getPointerAddress() const { @@ -284,7 +274,6 @@ inferAndValidateAllocationSizesAndStrides( template struct TensorArg : public TensorArgAbstract { TENSOR_TYPE instance_; - std::array alloc_sizes; TensorArg(const at::Tensor& tensor, TensorView* tv, ExpressionEvaluator& eval) : TensorArgAbstract(tensor) { @@ -304,21 +293,12 @@ struct TensorArg : public TensorArgAbstract { TORCH_INTERNAL_ASSERT( (size_t)instance_.nAllocationDims() == sizes_strides.size()); for (auto i : c10::irange((int64_t)sizes_strides.size())) { - alloc_sizes.at(i) = sizes_strides.at(i).first; using stride_t = typename TENSOR_TYPE::index_type; instance_.setStride(i, (stride_t)sizes_strides.at(i).second); } } - int64_t getAllocRank() const override { - return instance_.nAllocationDims(); - } - - int64_t getAllocSize(int64_t i) const override { - return alloc_sizes.at(i); - } - - int64_t getAllocStride(int64_t i) const override { + int64_t getStride(int64_t i) const override { return instance_.getStride(i); } @@ -343,15 +323,10 @@ struct TensorArg : public TensorArgAbstract { std::string toString() const override { std::stringstream ss; ss << TensorArgAbstract::toString(); - ss << " allocation size: ("; - for (auto i = 0; i < getAllocRank(); i++) { - ss << getAllocSize(i) << ", "; - } - ss << ") allocation stride: ("; - for (auto i = 0; i < getAllocRank(); i++) { - ss << getAllocStride(i) << ", "; + ss << " stride: ("; + for (auto i = 0; i < getRank(); i++) { + ss << getStride(i) << ", "; } - ss << ")"; return ss.str(); } From 6fcbbbcc3749281183c2801230573f7f0261ba48 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 23 May 2023 10:23:09 -0700 Subject: [PATCH 08/12] format --- csrc/scheduler/transpose.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp index d0e2f445bfe..5d30bf81c49 100644 --- a/csrc/scheduler/transpose.cpp +++ b/csrc/scheduler/transpose.cpp @@ -699,8 +699,7 @@ std::shared_ptr getTransposeHeuristics( size_t vectorize_factor2 = max_unroll_factor; for (auto tv : grouped_inputs_outputs[0]) { - const auto tv_vectorize_factor = - runtime_info.getMaxVectorizableWidth(tv); + const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv); vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor); } // TODO: Since group2 only has global->shared and shared->global set op, we @@ -708,8 +707,7 @@ std::shared_ptr getTransposeHeuristics( // We should not be using a single global vectorize factor for the entire // group 2 for (auto tv : grouped_inputs_outputs[1]) { - const auto tv_vectorize_factor = - runtime_info.getMaxVectorizableWidth(tv); + const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv); vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor); } From 3b928298868d5aec1d20ee46b23f9034ccc46375 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 May 2023 11:16:23 -0700 Subject: [PATCH 09/12] transpose fix --- csrc/scheduler/registry.cpp | 8 +++++++- csrc/scheduler/registry.h | 2 +- csrc/scheduler/transpose.cpp | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/csrc/scheduler/registry.cpp b/csrc/scheduler/registry.cpp index 711e8911fc0..4bbea966fbf 100644 --- a/csrc/scheduler/registry.cpp +++ b/csrc/scheduler/registry.cpp @@ -1051,7 +1051,9 @@ size_t SchedulerRuntimeInfo::getAlignmentSize(TensorView* tv) { // Gets maximum vectorizable width of tv, assumes we can merge across all // iteration domains if contiguous. Cannot permute the dimensions to fix // contiguity. -size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) { +size_t SchedulerRuntimeInfo::getMaxVectorizableWidth( + TensorView* tv, + bool contig_merge) { // Gets the vectorizable width of the tv starting from the inner most // dimension, working its way towards the outer most dimension, if they're // contiguous. Ignores broadcast and reduction domains. @@ -1130,6 +1132,10 @@ size_t SchedulerRuntimeInfo::getMaxVectorizableWidth(TensorView* tv) { // Still contiguous numel *= dim_size->as(); + + if (!contig_merge) { + break; + } } // Assuming intermediate tensors have friendly alignment, and diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h index 80332f0cd65..1763e87f1a7 100644 --- a/csrc/scheduler/registry.h +++ b/csrc/scheduler/registry.h @@ -69,7 +69,7 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable { // Gets maximum vectorizable width of tv, assumes we can merge across all // iteration domains if contiguous. Cannot permute the dimensions to fix // contiguity. Ignores dimensions that are broadcast or reduction. - size_t getMaxVectorizableWidth(TensorView* tv); + size_t getMaxVectorizableWidth(TensorView* tv, bool contig_merge = true); // Computes alignment size in bytes for provided ptr address static size_t computeAlignmentSize(size_t ptr_address); diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp index 5d30bf81c49..9c10f9865ec 100644 --- a/csrc/scheduler/transpose.cpp +++ b/csrc/scheduler/transpose.cpp @@ -699,7 +699,8 @@ std::shared_ptr getTransposeHeuristics( size_t vectorize_factor2 = max_unroll_factor; for (auto tv : grouped_inputs_outputs[0]) { - const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv); + const auto tv_vectorize_factor = + runtime_info.getMaxVectorizableWidth(tv, false); vectorize_factor1 = std::min(vectorize_factor1, tv_vectorize_factor); } // TODO: Since group2 only has global->shared and shared->global set op, we @@ -707,7 +708,8 @@ std::shared_ptr getTransposeHeuristics( // We should not be using a single global vectorize factor for the entire // group 2 for (auto tv : grouped_inputs_outputs[1]) { - const auto tv_vectorize_factor = runtime_info.getMaxVectorizableWidth(tv); + const auto tv_vectorize_factor = + runtime_info.getMaxVectorizableWidth(tv, false); vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor); } From b740de247a3cb176321c27108fa5d28c3fdd2f92 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 May 2023 11:21:51 -0700 Subject: [PATCH 10/12] fix test --- test/test_gpu1.cpp | 12 ++++++------ test/test_gpu2.cpp | 30 +++++++++++++++--------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/test_gpu1.cpp b/test/test_gpu1.cpp index c809148c6c0..342d8b5aaad 100644 --- a/test/test_gpu1.cpp +++ b/test/test_gpu1.cpp @@ -1201,17 +1201,17 @@ TEST_F(NVFuserTest, FusionParser_CUDA) { // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { - int64_t i86; - i86 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); - if ((i86 < T0.size[0])) { + int64_t i87; + i87 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); + if ((i87 < T0.size[0])) { float T5[1]; T5[0] = 0; T5[0] - = T1[i86]; + = T1[i87]; float T4[1]; T4[0] = 0; T4[0] - = T0[i86]; + = T0[i87]; float T2[1]; T2[0] = T4[0] @@ -1220,7 +1220,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T6[0] = T2[0] * T4[0]; - T3[i86] + T3[i87] = T6[0]; } } diff --git a/test/test_gpu2.cpp b/test/test_gpu2.cpp index 6bccc21b9ce..d69dac4c9d1 100644 --- a/test/test_gpu2.cpp +++ b/test/test_gpu2.cpp @@ -9026,27 +9026,27 @@ TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) { // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor<__half, 4, 4> T0, Tensor<__half, 4, 4> T2, Tensor<__half, 4, 4> T7) { - int64_t i1201; - i1201 = T0.size[2] * T0.size[1]; - int64_t i1204; - i1204 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); - int64_t i1206; - i1206 = (T0.size[1] * T0.size[2]) * T0.size[3]; - int64_t i1238; - i1238 = i1204 % i1206; - int64_t i1215; - i1215 = T0.size[2] * T0.size[3]; + int64_t i1202; + i1202 = T0.size[2] * T0.size[1]; + int64_t i1205; + i1205 = ((nvfuser_index_t)threadIdx.x) + (128 * ((nvfuser_index_t)blockIdx.x)); + int64_t i1207; + i1207 = (T0.size[1] * T0.size[2]) * T0.size[3]; int64_t i1239; - i1239 = i1238 % i1215; - if ((i1204 < (((T0.size[0] * T0.size[1]) * T0.size[2]) * T0.size[3]))) { + i1239 = i1205 % i1207; + int64_t i1216; + i1216 = T0.size[2] * T0.size[3]; + int64_t i1240; + i1240 = i1239 % i1216; + if ((i1205 < (((T0.size[0] * T0.size[1]) * T0.size[2]) * T0.size[3]))) { __half T9[1]; T9[0] = 0; T9[0] - = T2[(((((i1201 * T0.size[3]) * (i1204 / i1206)) + (i1201 * (i1239 % T0.size[3]))) + (T0.size[2] * (i1238 / i1215))) + (i1239 / T0.size[3]))]; + = T2[(((((i1202 * T0.size[3]) * (i1205 / i1207)) + (i1202 * (i1240 % T0.size[3]))) + (T0.size[2] * (i1239 / i1216))) + (i1240 / T0.size[3]))]; __half T8[1]; T8[0] = 0; T8[0] - = T0[i1204]; + = T0[i1205]; float T3[1]; T3[0] = __half2float(T9[0]); @@ -9066,7 +9066,7 @@ __global__ void CUDAGeneratedKernel(Tensor<__half, 4, 4> T0, Tensor<__half, 4, 4 __half T10[1]; T10[0] = __float2half(T6[0]); - T7[i1204] + T7[i1205] = T10[0]; } } From a4099e5990f3513cdea0c7ea00b706ba5084277f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 May 2023 11:25:02 -0700 Subject: [PATCH 11/12] doc --- csrc/scheduler/registry.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/csrc/scheduler/registry.h b/csrc/scheduler/registry.h index 1763e87f1a7..b1ffc5b1d84 100644 --- a/csrc/scheduler/registry.h +++ b/csrc/scheduler/registry.h @@ -67,8 +67,9 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable { size_t getAlignmentSize(TensorView* tv); // Gets maximum vectorizable width of tv, assumes we can merge across all - // iteration domains if contiguous. Cannot permute the dimensions to fix - // contiguity. Ignores dimensions that are broadcast or reduction. + // iteration domains if contiguous, unless contig_merge=false. Cannot permute + // the dimensions to fix contiguity. Ignores dimensions that are broadcast or + // reduction. size_t getMaxVectorizableWidth(TensorView* tv, bool contig_merge = true); // Computes alignment size in bytes for provided ptr address From 344ef497a3a71fb10fdb8ea4eb3bde0c68b27278 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 May 2023 12:53:32 -0700 Subject: [PATCH 12/12] more vectorization --- csrc/scheduler/vectorize_helper.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp index d1ada990770..26f99d0d5bd 100644 --- a/csrc/scheduler/vectorize_helper.cpp +++ b/csrc/scheduler/vectorize_helper.cpp @@ -1157,6 +1157,11 @@ int64_t getVectorizationSize( auto denominator = denominator_optional->as(); auto extent = extent_optional->as(); + // TODO: we should clean this up with expr simplifier + auto gcd = std::gcd(numerator, denominator); + numerator = numerator / gcd; + denominator = denominator / gcd; + if (denominator != 1) { break; } @@ -1228,7 +1233,7 @@ size_t getVectorizationFactor( return 1; } - size_t max_expand_size = SchedulerRuntimeInfo::max_alignment_size_in_byte; + size_t max_vec_size = SchedulerRuntimeInfo::max_alignment_size_in_byte; size_t common_alignment_size = SchedulerRuntimeInfo::max_alignment_size_in_byte; @@ -1236,18 +1241,18 @@ size_t getVectorizationFactor( auto dtype_size = dataTypeSize(inp_or_out->dtype(), runtime_info.getIndexType()); - max_expand_size = std::min( - max_expand_size, + max_vec_size = std::min( + max_vec_size, SchedulerRuntimeInfo::max_alignment_size_in_byte / dtype_size); - max_expand_size = std::min( - max_expand_size, runtime_info.getMaxVectorizableWidth(inp_or_out)); + max_vec_size = std::min( + max_vec_size, runtime_info.getMaxVectorizableWidth(inp_or_out)); common_alignment_size = std::min( common_alignment_size, runtime_info.getAlignmentSize(inp_or_out)); } auto reference_map = vectorize_maps_entry.get().at(break_point); // Initialize to max the tensors could support. - size_t max_supported_vector_size = max_expand_size; + size_t max_supported_vector_size = max_vec_size; for (auto inp_or_out : vectorizable_inputs_outputs) { size_t contig_dim_size = getVectorizationSize( getContigVectorSizesOf(inp_or_out, reference_map), @@ -1255,7 +1260,7 @@ size_t getVectorizationFactor( size_t local_max_vec_size = 1; while (contig_dim_size > 1 && contig_dim_size % 2 == 0 && - local_max_vec_size < max_expand_size) { + local_max_vec_size < max_vec_size) { contig_dim_size /= 2; local_max_vec_size *= 2; }