diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index bce788ca38d..52616323b02 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -1384,10 +1384,15 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) { } TYPED_TEST(TestStringKernels, IsDigitUnicode) { - // These are digits according to Python, but we don't have the information in - // utf8proc for this - // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true, - // true]"); + // Digit checks across several Unicode scripts (categories Nd and No count). + // ٤: Arabic-Indic 4, ²: superscript 2, ३: Devanagari 3, ①: circled 1; + // Ⅷ (Roman numeral 8, Nl) and ⻁ (CJK radical) are not digits; "123" is ASCII. + // '¾' (vulgar fraction, No) is also treated as a digit but is not exercised here. + this->CheckUnary( + "utf8_is_digit", + R"(["0", "٤", "۵", "३", "१२३", "٣٣", "²", "123", "٣٢", "٩", "①", "Ⅷ", "abc" , "⻁", ""])", + boolean(), + R"([true, true, true, true, true, true, true, true, true, true, true, false, false, false, false])"); } TYPED_TEST(TestStringKernels, IsNumericUnicode) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc index e7b7952df3a..8eb2751cbb0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc @@ -138,9 +138,12 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) { } static inline bool IsDigitCharacterUnicode(uint32_t codepoint) { - // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal. - // utf8proc has no support for this, this is the best we can do: - return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND); + // Approximates Python's str.isdigit() (Numeric_Type=Digit or Decimal). + // utf8proc has no Numeric_Type data, so accept Nd and No (e.g., '٣', '³'); + // Nl, such as the Roman numeral 'Ⅷ', is excluded. 
+ // NOTE(review): No also holds fractions such as '¾', so this is broader than isdigit(). + return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND, + UTF8PROC_CATEGORY_NO); } static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {