From 87cdae87509c4a23b0f12f2accace97bfea304f1 Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 03:59:28 +0000
Subject: [PATCH 1/7] =?UTF-8?q?Fix=20ACOS=20precision=20issue=20for=20boun?=
 =?UTF-8?q?dary=20values=20(x=3D=C2=B11.0)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ACOS operator was producing incorrect results for boundary values
due to poor precision of ASIN's Taylor series expansion near x=±1.0.

Root cause:
- ASIN used a 6-term Taylor series that converges slowly near boundaries
- ACOS was implemented as acos(x) = π/2 - asin(x), inheriting ASIN errors
- At x=1.0, ASIN error of 0.354874 (22.6%) caused ACOS to output 0.354874
  instead of 0.0

Solution:
- Modified ASIN to use system library function (asinf) for |x| >= 0.9
- Modified ACOS to use system library function (acosf) for |x| >= 0.9
- For |x| < 0.9, continue using Taylor series (accurate in this range)

This ensures high precision for boundary values while maintaining the
existing behavior for values in the middle range.

Fixes #18580
---
 src/target/llvm/intrin_rule_llvm.cc | 37 +++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)
diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 4ce7ce9f2291..60fa34713a4f 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -167,9 +167,18 @@ TVM_REGISTER_OP("tir.sinh")
 TVM_REGISTER_OP("tir.asin")
     .set_attr<FLegalize>("llvm.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       using tir::make_const;
+      using namespace intrin;
       const tir::CallNode* call = e.as<tir::CallNode>();
       ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      
+      // Use system library function for values near boundaries where Taylor series
+      // has poor precision. Threshold chosen to keep error < 1% for Taylor series.
+      PrimExpr threshold = make_const(x.dtype(), 0.9);
+      PrimExpr abs_x = tir::abs(x);
+      PrimExpr use_lib = abs_x >= threshold;
+      
+      // Taylor series for values away from boundaries
       PrimExpr x2 = x * x;
       PrimExpr term1 = x;
       PrimExpr term3 = term1 * x2 / make_const(x.dtype(), 6);
@@ -178,25 +187,45 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term9 = term7 * x2 * make_const(x.dtype(), 1225) / make_const(x.dtype(), 3456);
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
       PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
+      
+      // System library function for boundary values
+      PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
+      
       /* --- domain limit check --- */
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
-      PrimExpr out_range = tir::Or(x<lower, x> upper);
+      PrimExpr out_range = tir::Or(x < lower, x > upper);
       // Use a quiet NaN constant
       PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
-      // select: if out of [-1,1] → NaN, else → series
-      return tir::Select(out_range, nan_const, series);
+      
+      // select: if out of [-1,1] → NaN, else if |x| >= threshold → lib, else → series
+      return tir::Select(out_range, nan_const,
+                        tir::Select(use_lib, lib_result, series));
     });
 
 TVM_REGISTER_OP("tir.acos")
     .set_attr<FLegalize>("llvm.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       using tir::make_const;
+      using namespace intrin;
       const tir::CallNode* call = e.as<tir::CallNode>();
       ICHECK(call != nullptr) << "Invalid call node in acos legalization";
       const PrimExpr& x = call->args[0];
+      
+      // Use system library function for values near boundaries where ASIN Taylor series
+      // has poor precision, which would cause ACOS errors.
+      PrimExpr threshold = make_const(x.dtype(), 0.9);
+      PrimExpr abs_x = tir::abs(x);
+      PrimExpr use_lib = abs_x >= threshold;
+      
+      // For values away from boundaries, use π/2 - asin(x)
       PrimExpr half_pi = make_const(x.dtype(), M_PI / 2);
       PrimExpr asin_x = asin(x);
-      return half_pi - asin_x;
+      PrimExpr formula_result = half_pi - asin_x;
+      
+      // System library function for boundary values
+      PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
+      
+      return tir::Select(use_lib, lib_result, formula_result);
     });
 
 TVM_REGISTER_OP("tir.atan")

From 38da5475a762678380a59e9494c8f53ff1797adc Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 04:49:56 +0000
Subject: [PATCH 2/7] Add explicit domain check for acos to match asin
 implementation

---
 src/target/llvm/intrin_rule_llvm.cc | 39 +++++++++++++----------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 60fa34713a4f..5837d074bc71 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -171,14 +171,11 @@ TVM_REGISTER_OP("tir.asin")
       const tir::CallNode* call = e.as<tir::CallNode>();
       ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
-      
-      // Use system library function for values near boundaries where Taylor series
-      // has poor precision. Threshold chosen to keep error < 1% for Taylor series.
+
       PrimExpr threshold = make_const(x.dtype(), 0.9);
       PrimExpr abs_x = tir::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
-      
-      // Taylor series for values away from boundaries
+
       PrimExpr x2 = x * x;
       PrimExpr term1 = x;
       PrimExpr term3 = term1 * x2 / make_const(x.dtype(), 6);
@@ -187,20 +184,16 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term9 = term7 * x2 * make_const(x.dtype(), 1225) / make_const(x.dtype(), 3456);
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
       PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
-      
-      // System library function for boundary values
+
       PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
-      
-      /* --- domain limit check --- */
+
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
       PrimExpr out_range = tir::Or(x < lower, x > upper);
-      // Use a quiet NaN constant
       PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
-      
-      // select: if out of [-1,1] → NaN, else if |x| >= threshold → lib, else → series
+
       return tir::Select(out_range, nan_const,
-                        tir::Select(use_lib, lib_result, series));
+                         tir::Select(use_lib, lib_result, series));
     });
 
 TVM_REGISTER_OP("tir.acos")
@@ -210,22 +203,24 @@ TVM_REGISTER_OP("tir.acos")
       const tir::CallNode* call = e.as<tir::CallNode>();
       ICHECK(call != nullptr) << "Invalid call node in acos legalization";
       const PrimExpr& x = call->args[0];
-      
-      // Use system library function for values near boundaries where ASIN Taylor series
-      // has poor precision, which would cause ACOS errors.
+
       PrimExpr threshold = make_const(x.dtype(), 0.9);
       PrimExpr abs_x = tir::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
-      
-      // For values away from boundaries, use π/2 - asin(x)
+
       PrimExpr half_pi = make_const(x.dtype(), M_PI / 2);
       PrimExpr asin_x = asin(x);
       PrimExpr formula_result = half_pi - asin_x;
-      
-      // System library function for boundary values
+
       PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
-      
-      return tir::Select(use_lib, lib_result, formula_result);
+
+      PrimExpr lower = make_const(x.dtype(), -1.0);
+      PrimExpr upper = make_const(x.dtype(), 1.0);
+      PrimExpr out_range = tir::Or(x < lower, x > upper);
+      PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
+
+      return tir::Select(out_range, nan_const,
+                         tir::Select(use_lib, lib_result, formula_result));
     });
 
 TVM_REGISTER_OP("tir.atan")

From e27cea07a39eb0d7acf4e0ffcc5575520405d3dc Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 05:08:01 +0000
Subject: [PATCH 3/7] Fix clang-format issues: adjust operator spacing and
 return statement formatting

---
 src/target/llvm/intrin_rule_llvm.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 5837d074bc71..ba8eac7122b7 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -189,11 +189,10 @@ TVM_REGISTER_OP("tir.asin")
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
-      PrimExpr out_range = tir::Or(x < lower, x > upper);
+      PrimExpr out_range = tir::Or(x<lower, x> upper);
       PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
 
-      return tir::Select(out_range, nan_const,
-                         tir::Select(use_lib, lib_result, series));
+      return tir::Select(out_range, nan_const, tir::Select(use_lib, lib_result, series));
     });
 
 TVM_REGISTER_OP("tir.acos")
@@ -216,11 +215,10 @@ TVM_REGISTER_OP("tir.acos")
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
-      PrimExpr out_range = tir::Or(x < lower, x > upper);
+      PrimExpr out_range = tir::Or(x<lower, x> upper);
       PrimExpr nan_const = make_const(x.dtype(), std::numeric_limits<double>::quiet_NaN());
 
-      return tir::Select(out_range, nan_const,
-                         tir::Select(use_lib, lib_result, formula_result));
+      return tir::Select(out_range, nan_const, tir::Select(use_lib, lib_result, formula_result));
     });
 
 TVM_REGISTER_OP("tir.atan")

From 7d1b8df953e6fde4a8974a1e0a1737a91ec71ff1 Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 06:30:22 +0000
Subject: [PATCH 4/7] Fix compilation errors: use tvm::abs and intrin::
 namespace for template parameters

---
 src/target/llvm/intrin_rule_llvm.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index ba8eac7122b7..2cc3d2f2959d 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -173,7 +173,7 @@ TVM_REGISTER_OP("tir.asin")
       const PrimExpr& x = call->args[0];
 
       PrimExpr threshold = make_const(x.dtype(), 0.9);
-      PrimExpr abs_x = tir::abs(x);
+      PrimExpr abs_x = tvm::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
 
       PrimExpr x2 = x * x;
@@ -185,7 +185,7 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
       PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
 
-      PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
+      PrimExpr lib_result = intrin::DispatchPureExtern<intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
@@ -204,14 +204,14 @@ TVM_REGISTER_OP("tir.acos")
       const PrimExpr& x = call->args[0];
 
       PrimExpr threshold = make_const(x.dtype(), 0.9);
-      PrimExpr abs_x = tir::abs(x);
+      PrimExpr abs_x = tvm::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
 
       PrimExpr half_pi = make_const(x.dtype(), M_PI / 2);
       PrimExpr asin_x = asin(x);
       PrimExpr formula_result = half_pi - asin_x;
 
-      PrimExpr lib_result = DispatchPureExtern<FloatSuffix>(e);
+      PrimExpr lib_result = intrin::DispatchPureExtern<intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);

From 22d1b1457609eb60a0c59614261b185be40bc1d5 Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 06:55:06 +0000
Subject: [PATCH 5/7] Fix namespace resolution for DispatchPureExtern in LLVM
 intrinsic rules

---
 src/target/llvm/intrin_rule_llvm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 2cc3d2f2959d..138df3871daf 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -185,7 +185,7 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
       PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
 
-      PrimExpr lib_result = intrin::DispatchPureExtern<intrin::FloatSuffix>(e);
+      PrimExpr lib_result = ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
@@ -211,7 +211,7 @@ TVM_REGISTER_OP("tir.acos")
       PrimExpr asin_x = asin(x);
       PrimExpr formula_result = half_pi - asin_x;
 
-      PrimExpr lib_result = intrin::DispatchPureExtern<intrin::FloatSuffix>(e);
+      PrimExpr lib_result = ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);

From 07d8c2155f5561f6fbe11f916877bcd1d783016f Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Fri, 12 Dec 2025 07:10:49 +0000
Subject: [PATCH 6/7] Fix line length lint errors in intrin_rule_llvm.cc

---
 src/target/llvm/intrin_rule_llvm.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 138df3871daf..14a2772a8af4 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -185,7 +185,8 @@ TVM_REGISTER_OP("tir.asin")
       PrimExpr term11 = term9 * x2 * make_const(x.dtype(), 3969) / make_const(x.dtype(), 28160);
       PrimExpr series = term1 + term3 + term5 + term7 + term9 + term11;
 
-      PrimExpr lib_result = ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
+      PrimExpr lib_result =
+          ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);
@@ -211,7 +212,8 @@ TVM_REGISTER_OP("tir.acos")
       PrimExpr asin_x = asin(x);
       PrimExpr formula_result = half_pi - asin_x;
 
-      PrimExpr lib_result = ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
+      PrimExpr lib_result =
+          ::tvm::codegen::intrin::DispatchPureExtern<::tvm::codegen::intrin::FloatSuffix>(e);
 
       PrimExpr lower = make_const(x.dtype(), -1.0);
       PrimExpr upper = make_const(x.dtype(), 1.0);

From b584e10876f82f8267f0bfc061f64180d435d909 Mon Sep 17 00:00:00 2001
From: Dayuxiaoshui <792179245@qq.com>
Date: Sun, 14 Dec 2025 09:21:00 +0000
Subject: [PATCH 7/7] Improve asin/acos precision by adjusting threshold and
 add boundary tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Lower threshold from 0.9 to 0.5 for asin/acos legalization
  * For |x| >= 0.5: use system library function for better accuracy
  * For |x| < 0.5: use Taylor series for efficiency
  * This improves precision for 0.5 <= |x| < 0.9, which previously used the
    Taylor series, while maintaining performance for smaller values

- Add test_asin_acos_boundary_values() test function
  * Tests boundary values (±1.0)
  * Tests threshold switching point (±0.5)
  * Tests values below threshold (±0.49, ±0.3, 0.0)
  * Tests out-of-domain values (should return NaN)

This addresses precision issues with asin/acos near boundary values.
---
 src/target/llvm/intrin_rule_llvm.cc      |  4 +-
 tests/python/tir-base/test_tir_intrin.py | 53 ++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 14a2772a8af4..a8a3d911ca8e 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -172,7 +172,7 @@ TVM_REGISTER_OP("tir.asin")
       ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
 
-      PrimExpr threshold = make_const(x.dtype(), 0.9);
+      PrimExpr threshold = make_const(x.dtype(), 0.5);
       PrimExpr abs_x = tvm::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
 
@@ -204,7 +204,7 @@ TVM_REGISTER_OP("tir.acos")
       ICHECK(call != nullptr) << "Invalid call node in acos legalization";
       const PrimExpr& x = call->args[0];
 
-      PrimExpr threshold = make_const(x.dtype(), 0.9);
+      PrimExpr threshold = make_const(x.dtype(), 0.5);
       PrimExpr abs_x = tvm::abs(x);
       PrimExpr use_lib = abs_x >= threshold;
 
diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py
index 8dabdbb344f3..1e8c88e08e65 100644
--- a/tests/python/tir-base/test_tir_intrin.py
+++ b/tests/python/tir-base/test_tir_intrin.py
@@ -135,6 +135,58 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
         run_test(*func, atol, rtol)
 
 
+def test_asin_acos_boundary_values():
+    """Test asin and acos with boundary values and threshold switching."""
+    test_funcs = [
+        (tvm.tir.asin, lambda x: np.arcsin(x)),
+        (tvm.tir.acos, lambda x: np.arccos(x)),
+    ]
+
+    def run_test(tvm_intrin, np_func):
+        m = te.var("m")
+        A = te.placeholder((m,), name="A")
+        B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B")
+
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+        func = tvm.compile(sch.mod, target="llvm")
+
+        dev = tvm.cpu(0)
+
+        # Test boundary values: ±1.0 (should use system library)
+        boundary_values = np.array([1.0, -1.0], dtype=np.float32)
+        a1 = tvm.runtime.tensor(boundary_values, dev)
+        b1 = tvm.runtime.tensor(np.empty_like(boundary_values), dev)
+        func(a1, b1)
+        tvm.testing.assert_allclose(b1.numpy(), np_func(boundary_values), atol=1e-5, rtol=1e-5)
+
+        # Test values at threshold: ±0.5 (should use system library)
+        threshold_values = np.array([0.5, -0.5], dtype=np.float32)
+        a2 = tvm.runtime.tensor(threshold_values, dev)
+        b2 = tvm.runtime.tensor(np.empty_like(threshold_values), dev)
+        func(a2, b2)
+        tvm.testing.assert_allclose(b2.numpy(), np_func(threshold_values), atol=1e-4, rtol=1e-4)
+
+        # Test values below threshold: ±0.49, ±0.3, 0.0 (should use Taylor series)
+        below_threshold_values = np.array([0.49, -0.49, 0.3, -0.3, 0.0], dtype=np.float32)
+        a3 = tvm.runtime.tensor(below_threshold_values, dev)
+        b3 = tvm.runtime.tensor(np.empty_like(below_threshold_values), dev)
+        func(a3, b3)
+        tvm.testing.assert_allclose(
+            b3.numpy(), np_func(below_threshold_values), atol=1e-3, rtol=1e-3
+        )
+
+        # Test out-of-domain values: should return NaN
+        out_of_domain = np.array([1.1, -1.1, 2.0, -2.0], dtype=np.float32)
+        a4 = tvm.runtime.tensor(out_of_domain, dev)
+        b4 = tvm.runtime.tensor(np.empty_like(out_of_domain), dev)
+        func(a4, b4)
+        assert np.all(np.isnan(b4.numpy())), "Out-of-domain inputs should return NaN"
+
+    for func in test_funcs:
+        run_test(*func)
+
+
 def test_binary_intrin():
     test_funcs = [
         (tvm.tir.atan2, lambda x1, x2: np.arctan2(x1, x2)),
@@ -315,6 +367,7 @@ def test_fma():
     test_nearbyint()
     test_unary_intrin()
     test_round_intrinsics_on_int()
+    test_asin_acos_boundary_values()
     test_binary_intrin()
     test_ldexp()
     test_clz()