From 7e38020d06e75b748987cc922e10dd791ec1de35 Mon Sep 17 00:00:00 2001
From: iabhi4 <iamonecool@gmail.com>
Date: Mon, 26 May 2025 13:44:22 -0700
Subject: [PATCH 1/2] ARROW-46589: Fixed utf8_is_digit to support full Unicode
 digit range

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 12 ++++++++----
 cpp/src/arrow/compute/kernels/scalar_string_utf8.cc |  7 ++++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index bce788ca38d..6167621845a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -1384,10 +1384,14 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
 }
 
 TYPED_TEST(TestStringKernels, IsDigitUnicode) {
-  // These are digits according to Python, but we don't have the information in
-  // utf8proc for this
-  // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true,
-  // true]");
+  // Tests for digits across various Unicode scripts.
+  // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), １２３: Fullwidth 123
+  // '¾' (vulgar fraction) is treated as a digit by utf8proc
+  this->CheckUnary(
+      "utf8_is_digit",
+      R"(["0", "٤", "۵", "३", "१२३", "٣٣", "²", "１２３", "٣٢", "٩", "①", "Ⅷ", "abc" , "⻁", ""])",
+      boolean(),
+      R"([true, true, true, true, true, true, true, true, true, true, true, false, false, false, false])");
 }
 
 TYPED_TEST(TestStringKernels, IsNumericUnicode) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
index e7b7952df3a..baf32df7b29 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
@@ -138,9 +138,10 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
 }
 
 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
-  // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
-  // utf8proc has no support for this, this is the best we can do:
-  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
+  // Approximates Python's str.isnumeric():
+  // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals ('Ⅷ') due to utf8proc limits.
+  // '¾' (vulgar fraction) is treated as a digit by utf8proc 'No'
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NO);
 }
 
 static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {

From ce921513241e144c3e362a4448327f1fba2790e9 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Mon, 2 Jun 2025 12:22:19 +0200
Subject: [PATCH 2/2] Fix lint

---
 cpp/src/arrow/compute/kernels/scalar_string_test.cc | 3 ++-
 cpp/src/arrow/compute/kernels/scalar_string_utf8.cc | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 6167621845a..52616323b02 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -1385,7 +1385,8 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
 
 TYPED_TEST(TestStringKernels, IsDigitUnicode) {
   // Tests for digits across various Unicode scripts.
-  // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), １２３: Fullwidth 123
+  // ٤: Arabic 4, ²: Superscript 2, ३: Devanagari 3, Ⅷ: Roman 8 (not digit),
+  // １２３: Fullwidth 123.
   // '¾' (vulgar fraction) is treated as a digit by utf8proc
   this->CheckUnary(
       "utf8_is_digit",
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
index baf32df7b29..8eb2751cbb0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
@@ -139,9 +139,11 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
 
 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
   // Approximates Python's str.isnumeric():
-  // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals ('Ⅷ') due to utf8proc limits.
+  // utf8proc has no Numeric_Type data, so match categories Nd and No (e.g., '٣', '³');
+  // Nl characters such as Roman numerals ('Ⅷ') are not matched.
   // '¾' (vulgar fraction) is treated as a digit by utf8proc 'No'
-  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NO);
+  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
+                                      UTF8PROC_CATEGORY_NO);
 }
 
 static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {