diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 8227530f7ab7..40790bc1797e 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -903,6 +903,9 @@ def _multi_gpu_exists(): # Mark a test as requiring NCCL support requires_nccl = Feature("nccl", "NCCL", cmake_flag="USE_NCCL", parent_features="cuda") +# Mark a test as requiring RCCL support +requires_rccl = Feature("rccl", "RCCL", cmake_flag="USE_RCCL", parent_features="rocm") + # Mark a test as requiring the NVPTX compilation on the CUDA runtime requires_nvptx = Feature( "nvptx", diff --git a/tests/python/disco/test_ccl.py b/tests/python/disco/test_ccl.py index c29ece957245..0f630c3efc39 100644 --- a/tests/python/disco/test_ccl.py +++ b/tests/python/disco/test_ccl.py @@ -32,7 +32,13 @@ from tvm.script import relax as R _all_session_kinds = [di.ThreadedSession, di.ProcessSession] -_ccl = [get_global_func("runtime.disco.compiled_ccl")()] +_ccl = [ + pytest.param( + ccl, + marks=tvm.testing.Feature._all_features[ccl].marks(), + ) + for ccl in ["nccl", "rccl"] +] def create_device_target(ccl): @@ -445,7 +451,6 @@ def main( W1: R.Tensor((128, 128), "float32"), W2: R.Tensor((128, 128), "float32"), ) -> R.Tensor((128, 128), "float32"): - R.func_attr({"global_symbol": "main"}) with R.dataflow(): lv0: R.Tensor((128, 128), "float32") = R.matmul(x, W1) lv1: R.Tensor((128, 128), "float32") = R.nn.gelu(lv0) @@ -461,7 +466,6 @@ def main( W1: R.Tensor((128, 64), "float32"), # shard along axis 1 W2: R.Tensor((64, 128), "float32"), # shard along axis 0 ) -> R.Tensor((128, 128), "float32"): - R.func_attr({"global_symbol": "main"}) with R.dataflow(): broadcast_x: R.Tensor((128, 128), "float32") = R.ccl.broadcast_from_worker0(x) lv0: R.Tensor((128, 64), "float32") = R.matmul(broadcast_x, W1) @@ -538,7 +542,6 @@ def main( # pylint: disable=too-many-locals Wv: R.Tensor((128, 512), "float32"), Wo: R.Tensor((512, 128), "float32"), ) -> R.Tensor((128, 128), "float32"): - R.func_attr({"global_symbol": "main"}) with R.dataflow(): # q lv0: R.Tensor((1, 10, 512), "float32") = R.matmul(x, Wq) @@ -578,7 +581,6 @@ def main( # pylint: disable=too-many-locals Wv: R.Tensor((128, 256), "float32"), # shard along axis 1 Wo: R.Tensor((256, 128), "float32"), # shard along axis 0 ) -> R.Tensor((128, 128), "float32"): - R.func_attr({"global_symbol": "main"}) with R.dataflow(): broadcast_x: R.Tensor((1, 10, 128), "float32") = R.ccl.broadcast_from_worker0(x) # q diff --git a/tests/python/disco/test_custom_allreduce.py b/tests/python/disco/test_custom_allreduce.py index 4aed32c052d9..cd075d6ba2b3 100644 --- a/tests/python/disco/test_custom_allreduce.py +++ b/tests/python/disco/test_custom_allreduce.py @@ -44,7 +44,13 @@ class AllReduceStrategyType(enum.IntEnum): AllReduceStrategyType.AUTO, ] -_ccl = [ccl for ccl in tvm.get_global_func("runtime.disco.compiled_ccl")() if ccl == "nccl"] +_ccl = [ + pytest.param( + ccl, + marks=tvm.testing.Feature._all_features[ccl].marks(), + ) + for ccl in ["nccl"] +] @pytest.mark.parametrize("shape", _shapes) diff --git a/tests/python/disco/test_loader.py b/tests/python/disco/test_loader.py index b4e2440857e6..77fbe4b2667f 100644 --- a/tests/python/disco/test_loader.py +++ b/tests/python/disco/test_loader.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Test sharded loader""" + # pylint: disable=missing-docstring import json import tempfile @@ -120,6 +121,7 @@ def _simulate_presharded_weights(base_path, param_dict, num_shards, shard_info): ) +@tvm.testing.requires_nccl def test_load_shard(): devices = [0, 1] num_shards = len(devices) @@ -177,6 +179,7 @@ def _create_presharded_loader(sess, path): return loader +@tvm.testing.requires_nccl def test_load_presharded(): devices = [0, 1] param_dict = { @@ -217,6 +220,7 @@ def test_load_presharded(): ) +@tvm.testing.requires_nccl def test_load_shard_in_relax(): devices = [0, 1] num_shards = len(devices) @@ -248,7 +252,6 @@ class Module: # pylint: disable=too-few-public-methods def main( loader: R.Object, ) -> R.Tuple(R.Tensor((64, 64), "float32"), R.Tensor((16, 128), "float32")): - R.func_attr({"global_symbol": "main"}) with R.dataflow(): lv0: R.Tensor((64, 64), "float32") = R.call_pure_packed( "runtime.disco.ShardLoaderLoad", @@ -309,6 +312,7 @@ def relax_build(mod, target): ) +@tvm.testing.requires_nccl def test_load_shard_all(): devices = [0, 1] num_shards = len(devices) @@ -346,6 +350,7 @@ def test_load_shard_all(): np.testing.assert_equal(param_dict["param_1"][16:32, :], p_1[1].numpy()) +@tvm.testing.requires_nccl def test_load_all_presharded(): devices = [0, 1] num_shards = len(devices) @@ -375,6 +380,7 @@ def test_load_all_presharded(): np.testing.assert_equal(param_dict["param_1"][:, 64:128], p_1[1].numpy()) +@tvm.testing.requires_nccl def test_load_shard_broadcast(): devices = [0, 1] param_dict = { @@ -396,6 +402,7 @@ def test_load_shard_broadcast(): np.testing.assert_equal(param_dict["param_1"], p_1[1].numpy()) +@tvm.testing.requires_nccl def test_load_qkv_proj_shard(): # pylint: disable=too-many-locals devices = [0, 1] num_shards = len(devices) diff --git a/tests/python/disco/test_session.py b/tests/python/disco/test_session.py index 38aa757bf8f1..b780eb97ab01 100644 --- a/tests/python/disco/test_session.py +++ b/tests/python/disco/test_session.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Basic tests for a Disco session""" + # pylint: disable=missing-docstring import tempfile @@ -260,7 +261,6 @@ def t2(A: T.Buffer((16, 8), "float32"), B: T.Buffer((8, 16), "float32")): def transpose_1( A: R.Tensor((8, 16), dtype="float32") ) -> R.Tensor((16, 8), dtype="float32"): - R.func_attr({"global_symbol": "transpose_1"}) cls = TestMod with R.dataflow(): B = R.call_tir(cls.t1, (A,), out_sinfo=R.Tensor((16, 8), dtype="float32")) @@ -271,7 +271,6 @@ def transpose_1( def transpose_2( A: R.Tensor((16, 8), dtype="float32") ) -> R.Tensor((8, 16), dtype="float32"): - R.func_attr({"global_symbol": "transpose_2"}) cls = TestMod with R.dataflow(): B = R.call_tir(cls.t2, (A,), out_sinfo=R.Tensor((8, 16), dtype="float32")) diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index 5b07b5256ea5..764383976f77 100755 --- a/tests/scripts/task_python_unittest.sh +++ b/tests/scripts/task_python_unittest.sh @@ -39,6 +39,7 @@ TEST_FILES=( "auto_scheduler" "autotvm" "codegen" + "disco" "ir" "meta_schedule" "micro"