Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1384,10 +1384,15 @@ TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
}

TYPED_TEST(TestStringKernels, IsDigitUnicode) {
// Checks "utf8_is_digit" on digit characters drawn from several Unicode
// scripts. Inputs, in order, with the result asserted below:
//   "0"                      ASCII digit                          -> true
//   "٤"                      Arabic-Indic digit four              -> true
//   "۵"                      Extended Arabic-Indic digit five     -> true
//   "३", "१२३"               Devanagari digits (single and run)   -> true
//   "٣٣"                     run of Arabic-Indic digits           -> true
//   "²"                      superscript two (not a decimal digit) -> true
//   "123"                    run of digits                        -> true
//   "٣٢", "٩"                Arabic-Indic digits                  -> true
//   "①"                      circled digit one (not a decimal digit) -> true
//   "Ⅷ"                      Roman numeral eight                  -> false
//   "abc", "⻁"               non-numeric characters               -> false
//   ""                       empty string                         -> false
// NOTE(review): an earlier comment described the "123" entry as fullwidth
// digits; it appears as plain ASCII here -- confirm the encoding in the file.
// NOTE(review): the kernel is also said to treat vulgar fractions such as '¾'
// as digits; no input in this test covers that case.
this->CheckUnary(
"utf8_is_digit",
R"(["0", "٤", "۵", "३", "१२३", "٣٣", "²", "123", "٣٢", "٩", "①", "Ⅷ", "abc" , "⻁", ""])",
boolean(),
R"([true, true, true, true, true, true, true, true, true, true, true, false, false, false, false])");
}

TYPED_TEST(TestStringKernels, IsNumericUnicode) {
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,12 @@ static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
}

// Returns true when `codepoint` is considered a digit by the utf8_is_digit
// kernel.
//
// Python's str.isdigit() is defined as Numeric_Type=Digit or
// Numeric_Type=Decimal. utf8proc has no support for Numeric_Type, so this is
// approximated with general categories instead:
//   - Nd (decimal number, e.g. '٣') and No (other number, e.g. '³') match;
//   - Nl (letter number, e.g. the Roman numeral 'Ⅷ') does not.
//
// Known over-approximation: No also contains characters such as the vulgar
// fraction '¾', so those are reported as digits here although Python's
// str.isdigit() rejects them.
static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
  return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
                                      UTF8PROC_CATEGORY_NO);
}

static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
Expand Down
Loading