Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 22 additions & 22 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -944,33 +944,33 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);

AddUnaryStringPredicate<IsAscii>("string_isascii", registry);

AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_isalnum", registry);
AddUnaryStringPredicate<IsAlphaAscii>("ascii_isalpha", registry);
AddUnaryStringPredicate<IsDecimalAscii>("ascii_isdecimal", registry);
// no isdigic for ascii, since it is the same as isdecimal
AddUnaryStringPredicate<IsLowerAscii>("ascii_islower", registry);
// no isnumeric for ascii, since it is the same as isdecimal
AddUnaryStringPredicate<IsPrintableAscii>("ascii_isprintable", registry);
AddUnaryStringPredicate<IsSpaceAscii>("ascii_isspace", registry);
AddUnaryStringPredicate<IsTitleAscii>("ascii_istitle", registry);
AddUnaryStringPredicate<IsUpperAscii>("ascii_isupper", registry);
AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry);

AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry);
AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry);
AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry);
// no is_digit for ascii, since it is the same as is_decimal
AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry);
// no is_numeric for ascii, since it is the same as is_decimal
AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry);
AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry);
AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry);
AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry);

#ifdef ARROW_WITH_UTF8PROC
MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);

AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_isalnum", registry);
AddUnaryStringPredicate<IsAlphaUnicode>("utf8_isalpha", registry);
AddUnaryStringPredicate<IsDecimalUnicode>("utf8_isdecimal", registry);
AddUnaryStringPredicate<IsDigitUnicode>("utf8_isdigit", registry);
AddUnaryStringPredicate<IsLowerUnicode>("utf8_islower", registry);
AddUnaryStringPredicate<IsNumericUnicode>("utf8_isnumeric", registry);
AddUnaryStringPredicate<IsPrintableUnicode>("utf8_isprintable", registry);
AddUnaryStringPredicate<IsSpaceUnicode>("utf8_isspace", registry);
AddUnaryStringPredicate<IsTitleUnicode>("utf8_istitle", registry);
AddUnaryStringPredicate<IsUpperUnicode>("utf8_isupper", registry);
AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry);
AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry);
AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry);
AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry);
AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry);
AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry);
AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry);
AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry);
AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry);
AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry);
#endif

AddBinaryLength(registry);
Expand Down
77 changes: 40 additions & 37 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
this->CheckUnary("utf8_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(),
"[\"aaazzææ&\", null, \"\", \"b\"]");

// test varying encoding lenghts and thus changing indices/offsets
// test varying encoding lengths and thus changing indices/offsets
this->CheckUnary("utf8_lower", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", this->type(),
"[\"ɑɽɽow\", null, \"ıi\", \"b\"]");

Expand All @@ -149,81 +149,81 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
this->CheckUnary("utf8_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
"[true, null, true, false, false]");
this->CheckUnary("utf8_is_alnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
boolean(), "[true, null, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
this->CheckUnary("utf8_isalpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
this->CheckUnary("utf8_is_alpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
"[true, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsAscii) {
this->CheckUnary("string_isascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(),
this->CheckUnary("string_is_ascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(),
"[true, null, false, true]");
}

TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
// ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
this->CheckUnary("utf8_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
this->CheckUnary("utf8_is_decimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
boolean(), "[true, null, true, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsDigitUnicode) {
// These are digits according to Python, but we don't have the information in
// utf8proc for this
// this->CheckUnary("utf8_isdigit", "[\"²\", \"①\"]", boolean(), "[true,
// this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true,
// true]");
}

TYPED_TEST(TestStringKernels, IsNumericUnicode) {
// ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
this->CheckUnary("utf8_isnumeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
this->CheckUnary("utf8_is_numeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
boolean(), "[true, null, true, true, false, false]");
// These are numerical according to Python, but we don't have the information in
// utf8proc for this
// this->CheckUnary("utf8_isnumeric", "[\"㐅\", \"卌\"]", boolean(),
// this->CheckUnary("utf8_is_numeric", "[\"㐅\", \"卌\"]", boolean(),
// "[true, null, true, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsLowerUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary("utf8_islower",
this->CheckUnary("utf8_is_lower",
"[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", "
"\"With space\"]",
boolean(),
"[false, null, true, false, true, false, false, true, false]");
// lower case characters utf8proc does not know about
// this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
// this->CheckUnary("utf8_is_lower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
// true]");
}

TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
// U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable
// U+2008 (utf8: \xe2\x80\x88) is punctuation space, it is NOT printable
// U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
this->CheckUnary(
"utf8_isprintable",
"utf8_is_printable",
"[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(),
"[true, null, false, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
// U+2008 (utf8: \xe2\x80\x88) is punctuaction space
this->CheckUnary("utf8_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
// U+2008 (utf8: \xe2\x80\x88) is punctuation space
this->CheckUnary("utf8_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
"[true, null, true, true]");
this->CheckUnary("utf8_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
this->CheckUnary("utf8_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
boolean(), "[false, null, false, false, true]");
}

TYPED_TEST(TestStringKernels, IsTitleUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary("utf8_istitle",
this->CheckUnary("utf8_is_title",
"[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]",
boolean(), "[true, null, true, true, true, true, true]");
this->CheckUnary(
"utf8_istitle",
"utf8_is_title",
"[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]",
boolean(), "[false, null, false, false, false, false, false, false]");
}
Expand All @@ -233,9 +233,10 @@ TYPED_TEST(TestStringKernels, IsTitleUnicode) {

TYPED_TEST(TestStringKernels, IsUpperUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary(
"utf8_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
boolean(), "[false, null, false, true, true, true, false, true, true]");
this->CheckUnary("utf8_is_upper",
"[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]",
boolean(),
"[false, null, false, true, true, true, false, true, true]");
// * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
// * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower
// case
Expand All @@ -245,7 +246,7 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
// * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13
// (not tested since it depends on the version of libutf8proc)
// * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
this->CheckUnary("utf8_isupper",
this->CheckUnary("utf8_is_upper",
"[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]",
boolean(), "[true, true, true, false, true, false]");
}
Expand All @@ -255,61 +256,63 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
#endif // ARROW_WITH_UTF8PROC

TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
this->CheckUnary("ascii_isalnum",
this->CheckUnary("ascii_is_alnum",
"[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]",
boolean(), "[false, null, false, false, false, false, false]");
this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
this->CheckUnary("ascii_is_alnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
boolean(), "[true, null, true, true, true, false]");
}

TYPED_TEST(TestStringKernels, IsAlphaAscii) {
this->CheckUnary("ascii_isalpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]",
this->CheckUnary("ascii_is_alpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]",
boolean(), "[false, true, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsDecimalAscii) {
// ٣ is arabic 3
this->CheckUnary("ascii_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
this->CheckUnary("ascii_is_decimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]",
boolean(), "[true, null, false, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsLowerAscii) {
// ٣ is arabic 3 (decimal), φ lower greek
this->CheckUnary("ascii_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]",
boolean(), "[false, null, true, false, true, false, false]");
this->CheckUnary("ascii_is_lower",
"[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", boolean(),
"[false, null, true, false, true, false, false]");
}
TYPED_TEST(TestStringKernels, IsPrintableAscii) {
// \xe2\x80\x88 is punctuaction space
this->CheckUnary("ascii_isprintable",
// \xe2\x80\x88 is punctuation space
this->CheckUnary("ascii_is_printable",
"[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(),
"[true, null, false, true, false]");
}

TYPED_TEST(TestStringKernels, IsSpaceAscii) {
// \xe2\x80\x88 is punctuaction space
// \xe2\x80\x88 is punctuation space
// Note: for ascii version, the non-ascii chars are seen as non-cased
this->CheckUnary("ascii_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
"[true, null, true, true]");
this->CheckUnary("ascii_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
boolean(), "[false, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsTitleAscii) {
// ٣ is arabic 3 (decimal), Φ capital
// Note: for ascii version, the non-ascii chars are seen as non-cased
this->CheckUnary("ascii_istitle",
this->CheckUnary("ascii_is_title",
"[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]",
boolean(), "[true, null, true, true, true, false, false]");
this->CheckUnary(
"ascii_istitle",
"ascii_is_title",
"[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]",
boolean(), "[false, null, false, false, true, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsUpperAscii) {
// ٣ is arabic 3 (decimal), Φ capital greek
this->CheckUnary("ascii_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]",
boolean(), "[false, null, false, true, true, false, false]");
this->CheckUnary("ascii_is_upper",
"[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", boolean(),
"[false, null, false, true, true, false, false]");
}

TYPED_TEST(TestStringKernels, MatchSubstring) {
Expand Down
40 changes: 20 additions & 20 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,26 +110,26 @@ def func(left, right):
utf8_upper = _simple_unary_function('utf8_upper')
utf8_lower = _simple_unary_function('utf8_lower')

string_isascii = _simple_unary_function('string_isascii')

ascii_isalnum = _simple_unary_function('ascii_isalnum')
utf8_isalnum = _simple_unary_function('utf8_isalnum')
ascii_isalpha = _simple_unary_function('ascii_isalpha')
utf8_isalpha = _simple_unary_function('utf8_isalpha')
ascii_isdecimal = _simple_unary_function('ascii_isdecimal')
utf8_isdecimal = _simple_unary_function('utf8_isdecimal')
ascii_isdigit = ascii_isdecimal # alias
utf8_isdigit = _simple_unary_function('utf8_isdigit')
ascii_islower = _simple_unary_function('ascii_islower')
utf8_islower = _simple_unary_function('utf8_islower')
ascii_isnumeric = ascii_isdecimal # alias
utf8_isnumeric = _simple_unary_function('utf8_isnumeric')
ascii_isprintable = _simple_unary_function('ascii_isprintable')
utf8_isprintable = _simple_unary_function('utf8_isprintable')
ascii_istitle = _simple_unary_function('ascii_istitle')
utf8_istitle = _simple_unary_function('utf8_istitle')
ascii_isupper = _simple_unary_function('ascii_isupper')
utf8_isupper = _simple_unary_function('utf8_isupper')
string_is_ascii = _simple_unary_function('string_is_ascii')

ascii_is_alnum = _simple_unary_function('ascii_is_alnum')
utf8_is_alnum = _simple_unary_function('utf8_is_alnum')
ascii_is_alpha = _simple_unary_function('ascii_is_alpha')
utf8_is_alpha = _simple_unary_function('utf8_is_alpha')
ascii_is_decimal = _simple_unary_function('ascii_is_decimal')
utf8_is_decimal = _simple_unary_function('utf8_is_decimal')
ascii_is_digit = ascii_is_decimal # alias
utf8_is_digit = _simple_unary_function('utf8_is_digit')
ascii_is_lower = _simple_unary_function('ascii_is_lower')
utf8_is_lower = _simple_unary_function('utf8_is_lower')
ascii_is_numeric = ascii_is_decimal # alias
utf8_is_numeric = _simple_unary_function('utf8_is_numeric')
ascii_is_printable = _simple_unary_function('ascii_is_printable')
utf8_is_printable = _simple_unary_function('utf8_is_printable')
ascii_is_title = _simple_unary_function('ascii_is_title')
utf8_is_title = _simple_unary_function('utf8_is_title')
ascii_is_upper = _simple_unary_function('ascii_is_upper')
utf8_is_upper = _simple_unary_function('utf8_is_upper')

is_valid = _simple_unary_function('is_valid')
is_null = _simple_unary_function('is_null')
Expand Down
29 changes: 15 additions & 14 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def find_new_unicode_codepoints():
new = set()
characters = [chr(c) for c in range(0x80, 0x11000)
if not (0xD800 <= c < 0xE000)]
is_printable = pc.utf8_isprintable(pa.array(characters)).to_pylist()
is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist()
for i, c in enumerate(characters):
if is_printable[i] != c.isprintable():
new.add(ord(c))
Expand All @@ -134,9 +134,9 @@ def find_new_unicode_codepoints():

# Python claims these are not alpha, not sure why; they are in
# gc='Other Letter': https://graphemica.com/%E1%B3%B2
unknown_issue_isalpha = {0x1cf2, 0x1cf3}
unknown_issue_is_alpha = {0x1cf2, 0x1cf3}
# utf8proc does not know if codepoints are lower case
utf8proc_issue_islower = {
utf8proc_issue_is_lower = {
0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
Expand Down Expand Up @@ -208,23 +208,24 @@ def find_new_unicode_codepoints():
0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, }

codepoints_ignore = {
'isalnum': numeric_info_missing | digit_info_missing |
unknown_issue_isalpha,
'isalpha': unknown_issue_isalpha,
'isdigit': digit_info_missing,
'isnumeric': numeric_info_missing,
'islower': utf8proc_issue_islower
'is_alnum': numeric_info_missing | digit_info_missing |
unknown_issue_is_alpha,
'is_alpha': unknown_issue_is_alpha,
'is_digit': digit_info_missing,
'is_numeric': numeric_info_missing,
'is_lower': utf8proc_issue_is_lower
}


@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii',
'isdecimal', 'isdigit', 'islower',
'isnumeric', 'isprintable',
'isspace', 'isupper', ])
@pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
'is_ascii', 'is_decimal',
'is_digit', 'is_lower',
'is_numeric', 'is_printable',
'is_space', 'is_upper', ])
@pytest.mark.parametrize('variant', ['ascii', 'utf8'])
def test_string_py_compat_boolean(function_name, variant):
arrow_name = variant + "_" + function_name
py_name = function_name
py_name = function_name.replace('_', '')
ignore = codepoints_ignore.get(function_name, set()) |\
find_new_unicode_codepoints()
for i in range(128 if ascii else 0x11000):
Expand Down