From 7e38020d06e75b748987cc922e10dd791ec1de35 Mon Sep 17 00:00:00 2001 From: iabhi4 Date: Mon, 26 May 2025 13:44:22 -0700 Subject: [PATCH 1/2] ARROW-46589: Fixed utf8_is_digit to support full Unicode digit range --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 12 ++++++++---- cpp/src/arrow/compute/kernels/scalar_string_utf8.cc | 7 ++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index bce788ca38d..6167621845a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1384,10 +1384,14 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) { } TYPED_TEST(TestStringKernels, IsDigitUnicode) { - // These are digits according to Python, but we don't have the information in - // utf8proc for this - // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true, - // true]"); + // Tests for digits across various Unicode scripts. 
+ // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), 123: Fullwidth 123 + // '¾' (vulgar fraction) is treated as a digit by utf8proc + this->CheckUnary( + "utf8_is_digit", + R"(["0", "٤", "۵", "३", "१२३", "٣٣", "²", "123", "٣٢", "٩", "①", "Ⅷ", "abc" , "⻁", ""])", + boolean(), + R"([true, true, true, true, true, true, true, true, true, true, true, false, false, false, false])"); } TYPED_TEST(TestStringKernels, IsNumericUnicode) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index e7b7952df3a..baf32df7b29 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -138,9 +138,10 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { } static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { - // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. - // utf8proc has no support for this, this is the best we can do: - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); + // Approximates Python's str.isdigit(): + // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals ('Ⅷ') due to utf8proc limits. 
+ // '¾' (vulgar fraction) is treated as a digit by utf8proc 'No' + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NO); } static inline bool IsNumericCharacterUnicode(uint32_t codepoint) { From ce921513241e144c3e362a4448327f1fba2790e9 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 2 Jun 2025 12:22:19 +0200 Subject: [PATCH 2/2] Fix lint --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 3 ++- cpp/src/arrow/compute/kernels/scalar_string_utf8.cc | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 6167621845a..52616323b02 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1385,7 +1385,8 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) { TYPED_TEST(TestStringKernels, IsDigitUnicode) { // Tests for digits across various Unicode scripts. - // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), 123: Fullwidth 123 + // ٤: Arabic 4, ³: Superscript 3, ५: Devanagari 5, Ⅷ: Roman 8 (not digit), + // 123: Fullwidth 123. // '¾' (vulgar fraction) is treated as a digit by utf8proc this->CheckUnary( "utf8_is_digit", diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index baf32df7b29..8eb2751cbb0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -139,9 +139,11 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { // Approximates Python's str.isdigit(): - // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals ('Ⅷ') due to utf8proc limits. 
+ // returns true for Nd and No (e.g., '٣', '³'), but excludes Nl like Roman numerals + // ('Ⅷ') due to utf8proc limits. // '¾' (vulgar fraction) is treated as a digit by utf8proc 'No' - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, UTF8PROC_CATEGORY_NO); + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NO); } static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {