From 1adbedc2084973d109d58226f710194aa6255dca Mon Sep 17 00:00:00 2001
From: krishnaraj36 <quic_kvegiraj@quicinc.com>
Date: Tue, 7 May 2024 12:53:52 +0530
Subject: [PATCH 1/4] [DLIGHT][GPU] Enhance opencl thread limit for schedules

Enhanced the opencl thread limit and improved the gpu schedules
for opencl targets.
It improves decode performance 20 % for few set of models.
---
 python/tvm/dlight/gpu/general_reduction.py | 3 +++
 python/tvm/dlight/gpu/rmsnorm.py           | 2 ++
 python/tvm/dlight/gpu/transpose.py         | 4 ++++
 python/tvm/dlight/gpu/utils.py             | 2 ++
 src/target/target_kind.cc                  | 4 ++--
 5 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py
index ef6bb1db91e1..e996e5e2026c 100644
--- a/python/tvm/dlight/gpu/general_reduction.py
+++ b/python/tvm/dlight/gpu/general_reduction.py
@@ -40,6 +40,9 @@ def apply(  # pylint: disable=too-many-locals
         if target.kind.name == "cuda":
             len_tx = 256
             unroll_depth = 256
+        if target.kind.name == "opencl":
+            len_tx = 256
+            unroll_depth = 64
         else:
             len_tx = 64
             unroll_depth = 64
diff --git a/python/tvm/dlight/gpu/rmsnorm.py b/python/tvm/dlight/gpu/rmsnorm.py
index f8b2bb4a172d..35cf26bbe62a 100644
--- a/python/tvm/dlight/gpu/rmsnorm.py
+++ b/python/tvm/dlight/gpu/rmsnorm.py
@@ -82,6 +82,8 @@ def apply(  # pylint: disable=too-many-locals,missing-docstring
     ) -> tir.Schedule:
         if target.kind.name == "cuda":
             num_tx = 512
+        if target.kind.name == "opencl":
+            num_tx = 256
         else:
             num_tx = 64
 
diff --git a/python/tvm/dlight/gpu/transpose.py b/python/tvm/dlight/gpu/transpose.py
index d4496756a2d0..85ed0ce18213 100644
--- a/python/tvm/dlight/gpu/transpose.py
+++ b/python/tvm/dlight/gpu/transpose.py
@@ -57,6 +57,10 @@ def apply(  # pylint: disable=too-many-locals
             len_tx = 16
             len_ty = 8
             unroll_depth = 256
+        if target.kind.name == "opencl":
+            len_tx = 16
+            len_ty = 8
+            unroll_depth = 64
         else:
             len_tx = 8
             len_ty = 4
diff --git a/python/tvm/dlight/gpu/utils.py b/python/tvm/dlight/gpu/utils.py
index 4f2df5cfa0c9..e27a6969ad88 100644
--- a/python/tvm/dlight/gpu/utils.py
+++ b/python/tvm/dlight/gpu/utils.py
@@ -55,6 +55,8 @@ def suggest_threads_per_block(
         threads = 256
     elif target.kind.name == "metal":
         threads = 256
+    elif target.kind.name == "opencl":
+        threads = 256
     else:
         threads = 64
     results: List[Optional[int]] = []
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 708d3ccd7621..f0fec041abcb 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -340,9 +340,9 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM)
     .set_target_parser(UpdateROCmAttrs);
 
 TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
-    .add_attr_option<Integer>("max_threads_per_block", Integer(256))
+    .add_attr_option<Integer>("max_threads_per_block", Integer(1024))
     .add_attr_option<Integer>("max_shared_memory_per_block", Integer(16384))
-    .add_attr_option<Integer>("max_num_threads", Integer(256))
+    .add_attr_option<Integer>("max_num_threads", Integer(1024))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
     // Faced that Qualcomm OpenCL runtime crashed without any error message in

From ebe70b30354c47c9a5cf6322250233488a5f140e Mon Sep 17 00:00:00 2001
From: krishnaraj36 <quic_kvegiraj@quicinc.com>
Date: Wed, 8 May 2024 17:18:50 +0530
Subject: [PATCH 2/4] Update the build test

---
 tests/cpp/build_module_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index 3d2adb235546..e47d0e258e8f 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -63,7 +63,7 @@ TEST(BuildModule, Basic) {
   ICHECK_EQ(mali_target->keys[2], "gpu");
   ICHECK_EQ(mali_target->GetAttr<String>("device").value(), "mali");
   ICHECK_EQ(mali_target->GetAttr<String>("model").value(), "Mali-T860MP4@800Mhz");
-  ICHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 256);
+  ICHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 1024);
 }
 
 TEST(BuildModule, Heterogeneous) {

From 4a7cfc56bc48f83ef344b54119ea87f0b985e37f Mon Sep 17 00:00:00 2001
From: krishnaraj36 <quic_kvegiraj@quicinc.com>
Date: Wed, 8 May 2024 17:41:11 +0530
Subject: [PATCH 3/4] reverted opencl max_thread enhancement

---
 src/target/target_kind.cc      | 4 ++--
 tests/cpp/build_module_test.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index f0fec041abcb..708d3ccd7621 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -340,9 +340,9 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM)
     .set_target_parser(UpdateROCmAttrs);
 
 TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
-    .add_attr_option<Integer>("max_threads_per_block", Integer(1024))
+    .add_attr_option<Integer>("max_threads_per_block", Integer(256))
     .add_attr_option<Integer>("max_shared_memory_per_block", Integer(16384))
-    .add_attr_option<Integer>("max_num_threads", Integer(1024))
+    .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
     // Faced that Qualcomm OpenCL runtime crashed without any error message in
diff --git a/tests/cpp/build_module_test.cc b/tests/cpp/build_module_test.cc
index e47d0e258e8f..3d2adb235546 100644
--- a/tests/cpp/build_module_test.cc
+++ b/tests/cpp/build_module_test.cc
@@ -63,7 +63,7 @@ TEST(BuildModule, Basic) {
   ICHECK_EQ(mali_target->keys[2], "gpu");
   ICHECK_EQ(mali_target->GetAttr<String>("device").value(), "mali");
   ICHECK_EQ(mali_target->GetAttr<String>("model").value(), "Mali-T860MP4@800Mhz");
-  ICHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 1024);
+  ICHECK_EQ(mali_target->GetAttr<Integer>("max_num_threads").value(), 256);
 }
 
 TEST(BuildModule, Heterogeneous) {

From 3af9b21fae22be4d044a3a4d0e85bb1f25ed7491 Mon Sep 17 00:00:00 2001
From: krishnaraj36 <quic_kvegiraj@quicinc.com>
Date: Thu, 9 May 2024 11:00:36 +0530
Subject: [PATCH 4/4] Fix in opencl thread assign

---
 python/tvm/dlight/gpu/general_reduction.py | 2 +-
 python/tvm/dlight/gpu/rmsnorm.py           | 2 +-
 python/tvm/dlight/gpu/transpose.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/tvm/dlight/gpu/general_reduction.py b/python/tvm/dlight/gpu/general_reduction.py
index e996e5e2026c..404b73a6f0cc 100644
--- a/python/tvm/dlight/gpu/general_reduction.py
+++ b/python/tvm/dlight/gpu/general_reduction.py
@@ -40,7 +40,7 @@ def apply(  # pylint: disable=too-many-locals
         if target.kind.name == "cuda":
             len_tx = 256
             unroll_depth = 256
-        if target.kind.name == "opencl":
+        elif target.kind.name == "opencl":
             len_tx = 256
             unroll_depth = 64
         else:
diff --git a/python/tvm/dlight/gpu/rmsnorm.py b/python/tvm/dlight/gpu/rmsnorm.py
index 35cf26bbe62a..4047721c9aa8 100644
--- a/python/tvm/dlight/gpu/rmsnorm.py
+++ b/python/tvm/dlight/gpu/rmsnorm.py
@@ -82,7 +82,7 @@ def apply(  # pylint: disable=too-many-locals,missing-docstring
     ) -> tir.Schedule:
         if target.kind.name == "cuda":
             num_tx = 512
-        if target.kind.name == "opencl":
+        elif target.kind.name == "opencl":
             num_tx = 256
         else:
             num_tx = 64
diff --git a/python/tvm/dlight/gpu/transpose.py b/python/tvm/dlight/gpu/transpose.py
index 85ed0ce18213..3bef3d61e536 100644
--- a/python/tvm/dlight/gpu/transpose.py
+++ b/python/tvm/dlight/gpu/transpose.py
@@ -57,7 +57,7 @@ def apply(  # pylint: disable=too-many-locals
             len_tx = 16
             len_ty = 8
             unroll_depth = 256
-        if target.kind.name == "opencl":
+        elif target.kind.name == "opencl":
             len_tx = 16
             len_ty = 8
             unroll_depth = 64