From 1adbedc2084973d109d58226f710194aa6255dca Mon Sep 17 00:00:00 2001 From: krishnaraj36 Date: Tue, 7 May 2024 12:53:52 +0530 Subject: [PATCH 1/4] [DLIGHT][GPU] Enhance opencl thread limit for schedules Enhanced the opencl thread limit and improved the gpu schedules for opencl targets. It improves decode performance 20 % for few set of models. --- python/tvm/dlight/gpu/general_reduction.py | 3 +++ python/tvm/dlight/gpu/rmsnorm.py | 2 ++ python/tvm/dlight/gpu/transpose.py | 4 ++++ python/tvm/dlight/gpu/utils.py | 2 ++ src/target/target_kind.cc | 4 ++-- 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index ef6bb1db91e1..e996e5e2026c 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -40,6 +40,9 @@ def apply( # pylint: disable=too-many-locals if target.kind.name == "cuda": len_tx = 256 unroll_depth = 256 + if target.kind.name == "opencl": + len_tx = 256 + unroll_depth = 64 else: len_tx = 64 unroll_depth = 64 diff --git a/python/tvm/dlight/gpu/rmsnorm.py b/python/tvm/dlight/gpu/rmsnorm.py index f8b2bb4a172d..35cf26bbe62a 100644 --- a/python/tvm/dlight/gpu/rmsnorm.py +++ b/python/tvm/dlight/gpu/rmsnorm.py @@ -82,6 +82,8 @@ def apply( # pylint: disable=too-many-locals,missing-docstring ) -> tir.Schedule: if target.kind.name == "cuda": num_tx = 512 + if target.kind.name == "opencl": + num_tx = 256 else: num_tx = 64 diff --git a/python/tvm/dlight/gpu/transpose.py b/python/tvm/dlight/gpu/transpose.py index d4496756a2d0..85ed0ce18213 100644 --- a/python/tvm/dlight/gpu/transpose.py +++ b/python/tvm/dlight/gpu/transpose.py @@ -57,6 +57,10 @@ def apply( # pylint: disable=too-many-locals len_tx = 16 len_ty = 8 unroll_depth = 256 + if target.kind.name == "opencl": + len_tx = 16 + len_ty = 8 + unroll_depth = 64 else: len_tx = 8 len_ty = 4 diff --git a/python/tvm/dlight/gpu/utils.py b/python/tvm/dlight/gpu/utils.py index 4f2df5cfa0c9..e27a6969ad88 100644 --- a/python/tvm/dlight/gpu/utils.py +++ b/python/tvm/dlight/gpu/utils.py @@ -55,6 +55,8 @@ def suggest_threads_per_block( threads = 256 elif target.kind.name == "metal": threads = 256 + elif target.kind.name == "opencl": + threads = 256 else: threads = 64 results: List[Optional[int]] = [] diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 708d3ccd7621..f0fec041abcb 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -340,9 +340,9 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .set_target_parser(UpdateROCmAttrs); TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) - .add_attr_option("max_threads_per_block", Integer(256)) + .add_attr_option("max_threads_per_block", Integer(1024)) .add_attr_option("max_shared_memory_per_block", Integer(16384)) - .add_attr_option("max_num_threads", Integer(256)) + .add_attr_option("max_num_threads", Integer(1024)) .add_attr_option("thread_warp_size", Integer(1)) .add_attr_option("texture_spatial_limit", Integer(16384)) // Faced that Qualcomm OpenCL runtime crashed without any error message in From ebe70b30354c47c9a5cf6322250233488a5f140e Mon Sep 17 00:00:00 2001 From: krishnaraj36 Date: Wed, 8 May 2024 17:18:50 +0530 Subject: [PATCH 2/4] Update the build test --- tests/cpp/build_module_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index 3d2adb235546..e47d0e258e8f 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -63,7 +63,7 @@ TEST(BuildModule, Basic) { ICHECK_EQ(mali_target->keys[2], "gpu"); ICHECK_EQ(mali_target->GetAttr("device").value(), "mali"); ICHECK_EQ(mali_target->GetAttr("model").value(), "Mali-T860MP4@800Mhz"); - ICHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 256); + ICHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 1024); } TEST(BuildModule, Heterogeneous) { From 4a7cfc56bc48f83ef344b54119ea87f0b985e37f Mon Sep 17 00:00:00 2001 From: krishnaraj36 Date: Wed, 8 May 2024 17:41:11 +0530 Subject: [PATCH 3/4] reverted opencl max_thread enhancement --- src/target/target_kind.cc | 4 ++-- tests/cpp/build_module_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index f0fec041abcb..708d3ccd7621 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -340,9 +340,9 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .set_target_parser(UpdateROCmAttrs); TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) - .add_attr_option("max_threads_per_block", Integer(1024)) + .add_attr_option("max_threads_per_block", Integer(256)) .add_attr_option("max_shared_memory_per_block", Integer(16384)) - .add_attr_option("max_num_threads", Integer(1024)) + .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(1)) .add_attr_option("texture_spatial_limit", Integer(16384)) // Faced that Qualcomm OpenCL runtime crashed without any error message in diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc index e47d0e258e8f..3d2adb235546 100644 --- a/tests/cpp/build_module_test.cc +++ b/tests/cpp/build_module_test.cc @@ -63,7 +63,7 @@ TEST(BuildModule, Basic) { ICHECK_EQ(mali_target->keys[2], "gpu"); ICHECK_EQ(mali_target->GetAttr("device").value(), "mali"); ICHECK_EQ(mali_target->GetAttr("model").value(), "Mali-T860MP4@800Mhz"); - ICHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 1024); + ICHECK_EQ(mali_target->GetAttr("max_num_threads").value(), 256); } TEST(BuildModule, Heterogeneous) { From 3af9b21fae22be4d044a3a4d0e85bb1f25ed7491 Mon Sep 17 00:00:00 2001 From: krishnaraj36 Date: Thu, 9 May 2024 11:00:36 +0530 Subject: [PATCH 4/4] Fix in opencl thread assign --- python/tvm/dlight/gpu/general_reduction.py | 2 +- python/tvm/dlight/gpu/rmsnorm.py | 2 +- python/tvm/dlight/gpu/transpose.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py index e996e5e2026c..404b73a6f0cc 100644 --- a/python/tvm/dlight/gpu/general_reduction.py +++ b/python/tvm/dlight/gpu/general_reduction.py @@ -40,7 +40,7 @@ def apply( # pylint: disable=too-many-locals if target.kind.name == "cuda": len_tx = 256 unroll_depth = 256 - if target.kind.name == "opencl": + elif target.kind.name == "opencl": len_tx = 256 unroll_depth = 64 else: diff --git a/python/tvm/dlight/gpu/rmsnorm.py b/python/tvm/dlight/gpu/rmsnorm.py index 35cf26bbe62a..4047721c9aa8 100644 --- a/python/tvm/dlight/gpu/rmsnorm.py +++ b/python/tvm/dlight/gpu/rmsnorm.py @@ -82,7 +82,7 @@ def apply( # pylint: disable=too-many-locals,missing-docstring ) -> tir.Schedule: if target.kind.name == "cuda": num_tx = 512 - if target.kind.name == "opencl": + elif target.kind.name == "opencl": num_tx = 256 else: num_tx = 64 diff --git a/python/tvm/dlight/gpu/transpose.py b/python/tvm/dlight/gpu/transpose.py index 85ed0ce18213..3bef3d61e536 100644 --- a/python/tvm/dlight/gpu/transpose.py +++ b/python/tvm/dlight/gpu/transpose.py @@ -57,7 +57,7 @@ def apply( # pylint: disable=too-many-locals len_tx = 16 len_ty = 8 unroll_depth = 256 - if target.kind.name == "opencl": + elif target.kind.name == "opencl": len_tx = 16 len_ty = 8 unroll_depth = 64