From ea9b18123ace164da2c6640e73b83dd8635425f9 Mon Sep 17 00:00:00 2001
From: Steven Lyubomirsky
Date: Fri, 22 Mar 2024 14:01:06 -0400
Subject: [PATCH 1/3] Revert changes that cause failures in MLC, mark and skip the failing tests

---
 src/driver/driver_api.cc                             |  4 +++-
 src/tir/ir/data_type_rewriter.cc                     |  6 ------
 src/tir/transforms/default_gpu_schedule.cc           |  3 +--
 .../test_tir_transform_force_narrow_index_to_i32.py  |  7 +++++++
 .../test_tir_transform_inject_ptx_async_copy.py      | 12 +++++++++---
 .../test_transform_default_gpu_schedule.py           |  6 ++++++
 6 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index e3b4a5a6517c..33b4514e6b29 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -590,7 +590,6 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
   mixed_pass_list.push_back(tir::transform::ThreadSync("shared"));
   mixed_pass_list.push_back(tir::transform::ThreadSync("shared.dyn"));
-  mixed_pass_list.push_back(tir::transform::MergeSharedMemoryAllocations());
   mixed_pass_list.push_back(tir::transform::ThreadSync("warp"));
   mixed_pass_list.push_back(tir::transform::InferFragment());
   mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce());
@@ -608,6 +607,9 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
 
   mixed_pass_list.push_back(tir::transform::AnnotateDeviceRegions());
   mixed_pass_list.push_back(tir::transform::SplitHostDevice());
+  // MergeSharedMemoryAllocations must be applied after SplitHostDevice
+  // because the merged allocation site is at the beginning of each device function
+  mixed_pass_list.push_back(tir::transform::MergeSharedMemoryAllocations());
 
   bool unpacked_api = mixed_mod->GetAttr<relay::Executor>(tvm::attr::kExecutor)
                           .value_or(relay::Executor::Create("graph", {}))
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index 3461597b8e0f..2d2c097be494 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -532,12 +532,6 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
     n->loop_var = new_loop_var;
     n->min = cast(new_loop_var.dtype(), min);
     n->extent = cast(new_loop_var.dtype(), extent);
-    if (op->thread_binding.defined()) {
-      auto old_thread_binding = op->thread_binding.value();
-      auto* ptr = old_thread_binding.CopyOnWrite();
-      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.dtype());
-      n->thread_binding = std::move(Optional<IterVar>(std::move(old_thread_binding)));
-    }
     n->body = new_body;
     return std::move(new_for);
   } else {
diff --git a/src/tir/transforms/default_gpu_schedule.cc b/src/tir/transforms/default_gpu_schedule.cc
index 6d0542257309..6cf7f6e06743 100644
--- a/src/tir/transforms/default_gpu_schedule.cc
+++ b/src/tir/transforms/default_gpu_schedule.cc
@@ -113,8 +113,7 @@ bool IsScheduledOnGPU(const BaseFunc& func) {
 
   if (target.defined()) {
     int dev_type = target->GetTargetDeviceType();
-    if (!(dev_type == kDLCUDA || dev_type == kDLMetal || dev_type == kDLROCM ||
-          dev_type == kDLWebGPU)) {
+    if (dev_type != kDLCUDA) {
       return false;
     }
   }
diff --git a/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py b/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
index c1b81853deed..4f33ec0b88b4 100644
--- a/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
+++ b/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
@@ -45,6 +45,13 @@ def expected(A: T.Buffer((64,), "float32"), B: T.Buffer((64,), "float32")):
     tvm.ir.assert_structural_equal(func, expected)
 
 
+@pytest.mark.skip(
+    reason="Caused by failing to update the datatype in an IndexVar in data_type_rewriter.cc. "
+    "However, changing it breaks important code in MLC. "
+    "This should be fixed and addressed. "
+    "See discussion in https://github.com/apache/tvm/pull/16634#issuecomment-1973891325 "
+    "and https://github.com/apache/tvm/pull/16769."
+)
 def test_thread_axis2():
     @T.prim_func
     def before(
diff --git a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
index c52aca767410..4dd59b4dd18b 100644
--- a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
@@ -482,6 +482,12 @@ def simple_compute(
     assert generated_code == expected_cuda_script
 
 
+@pytest.mark.skip(
+    reason="This test fails due to an ordering issue with MergeSharedMemoryAllocations "
+    "in driver_api.cc. However, fixing this causes failures in MLC. "
+    "This bug should be addressed. See discussion in https://github.com/apache/tvm/pull/16769 "
+    "and https://github.com/apache/tvm/pull/16569#issuecomment-1992720448"
+)
 @tvm.testing.requires_cuda
 def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support):
     @T.prim_func
@@ -949,9 +955,9 @@ def before(A: T.Buffer((32, 128), "float16")):
                 T.attr("default", "async_scope", 1)
                 for i in range(16):
                     cse_var_1: T.int64 = T.Cast("int64", i)
-                    A_shared[
-                        T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)
-                    ] = A_flattened[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)]
+                    A_shared[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)] = (
+                        A_flattened[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)]
+                    )
                 T.ptx_commit_group()
             T.ptx_wait_group(0)
 
diff --git a/tests/python/tir-transform/test_transform_default_gpu_schedule.py b/tests/python/tir-transform/test_transform_default_gpu_schedule.py
index 63809beade8a..7dc3ca20d544 100644
--- a/tests/python/tir-transform/test_transform_default_gpu_schedule.py
+++ b/tests/python/tir-transform/test_transform_default_gpu_schedule.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name,,missing-function-docstring
+import pytest
 import tvm
 from tvm.tir.transform import DefaultGPUSchedule
 from tvm.script import tir as T
@@ -454,6 +455,11 @@ def full(
     assert tvm.ir.structural_equal(After, Expected)
 
 
+@pytest.mark.skip(
+    reason="src/tir/transforms/default_gpu_schedule.cc only checks for CUDA, not Metal. "
+    "However, changing this definition causes failures in MLC. This should be fixed. "
+    "See discussion in https://github.com/apache/tvm/pull/16634"
+)
 def test_add_on_metal():
     # pylint: disable=no-self-argument,missing-class-docstring,line-too-long
     # fmt: off

From 428544d12c9899c0a80a48b5c9310ac749c19c8f Mon Sep 17 00:00:00 2001
From: Steven Lyubomirsky
Date: Fri, 22 Mar 2024 14:23:23 -0400
Subject: [PATCH 2/3] Lint

---
 .../test_tir_transform_inject_ptx_async_copy.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
index 4dd59b4dd18b..4c94dc04ccb6 100644
--- a/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/tir-transform/test_tir_transform_inject_ptx_async_copy.py
@@ -955,9 +955,9 @@ def before(A: T.Buffer((32, 128), "float16")):
                 T.attr("default", "async_scope", 1)
                 for i in range(16):
                     cse_var_1: T.int64 = T.Cast("int64", i)
-                    A_shared[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)] = (
-                        A_flattened[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)]
-                    )
+                    A_shared[
+                        T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)
+                    ] = A_flattened[T.Ramp(tx * T.int64(128) + cse_var_1 * T.int64(8), T.int64(1), 8)]
                 T.ptx_commit_group()
             T.ptx_wait_group(0)
 

From de1e9a619b3bee6cef773ff2fd8e389ab1dc7bfe Mon Sep 17 00:00:00 2001
From: Steven Lyubomirsky
Date: Fri, 22 Mar 2024 14:27:03 -0400
Subject: [PATCH 3/3] Restore changes unrelated to driver API reordering

---
 src/tir/ir/data_type_rewriter.cc                          | 6 ++++++
 src/tir/transforms/default_gpu_schedule.cc                | 3 ++-
 .../test_tir_transform_force_narrow_index_to_i32.py       | 7 -------
 .../tir-transform/test_transform_default_gpu_schedule.py  | 6 ------
 4 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index 2d2c097be494..3461597b8e0f 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -532,6 +532,12 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
     n->loop_var = new_loop_var;
     n->min = cast(new_loop_var.dtype(), min);
     n->extent = cast(new_loop_var.dtype(), extent);
+    if (op->thread_binding.defined()) {
+      auto old_thread_binding = op->thread_binding.value();
+      auto* ptr = old_thread_binding.CopyOnWrite();
+      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.dtype());
+      n->thread_binding = std::move(Optional<IterVar>(std::move(old_thread_binding)));
+    }
     n->body = new_body;
     return std::move(new_for);
   } else {
diff --git a/src/tir/transforms/default_gpu_schedule.cc b/src/tir/transforms/default_gpu_schedule.cc
index 6cf7f6e06743..6d0542257309 100644
--- a/src/tir/transforms/default_gpu_schedule.cc
+++ b/src/tir/transforms/default_gpu_schedule.cc
@@ -113,7 +113,8 @@ bool IsScheduledOnGPU(const BaseFunc& func) {
 
   if (target.defined()) {
     int dev_type = target->GetTargetDeviceType();
-    if (dev_type != kDLCUDA) {
+    if (!(dev_type == kDLCUDA || dev_type == kDLMetal || dev_type == kDLROCM ||
+          dev_type == kDLWebGPU)) {
       return false;
     }
   }
diff --git a/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py b/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
index 4f33ec0b88b4..c1b81853deed 100644
--- a/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
+++ b/tests/python/tir-transform/test_tir_transform_force_narrow_index_to_i32.py
@@ -45,13 +45,6 @@ def expected(A: T.Buffer((64,), "float32"), B: T.Buffer((64,), "float32")):
     tvm.ir.assert_structural_equal(func, expected)
 
 
-@pytest.mark.skip(
-    reason="Caused by failing to update the datatype in an IndexVar in data_type_rewriter.cc. "
-    "However, changing it breaks important code in MLC. "
-    "This should be fixed and addressed. "
-    "See discussion in https://github.com/apache/tvm/pull/16634#issuecomment-1973891325 "
-    "and https://github.com/apache/tvm/pull/16769."
-)
 def test_thread_axis2():
     @T.prim_func
     def before(
diff --git a/tests/python/tir-transform/test_transform_default_gpu_schedule.py b/tests/python/tir-transform/test_transform_default_gpu_schedule.py
index 7dc3ca20d544..63809beade8a 100644
--- a/tests/python/tir-transform/test_transform_default_gpu_schedule.py
+++ b/tests/python/tir-transform/test_transform_default_gpu_schedule.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name,,missing-function-docstring
-import pytest
 import tvm
 from tvm.tir.transform import DefaultGPUSchedule
 from tvm.script import tir as T
@@ -455,11 +454,6 @@ def full(
     assert tvm.ir.structural_equal(After, Expected)
 
 
-@pytest.mark.skip(
-    reason="src/tir/transforms/default_gpu_schedule.cc only checks for CUDA, not Metal. "
-    "However, changing this definition causes failures in MLC. This should be fixed. "
-    "See discussion in https://github.com/apache/tvm/pull/16634"
-)
 def test_add_on_metal():
     # pylint: disable=no-self-argument,missing-class-docstring,line-too-long
     # fmt: off