From 3f6efe45f836cdbd198455334ae7a9dd6b708d9a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 15:58:36 -0500 Subject: [PATCH 1/2] Add underscores to is* string functions --- .../arrow/compute/kernels/scalar_string.cc | 44 ++++++------ .../compute/kernels/scalar_string_test.cc | 67 ++++++++++--------- python/pyarrow/compute.py | 40 +++++------ python/pyarrow/tests/test_compute.py | 29 ++++---- 4 files changed, 92 insertions(+), 88 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 31517930c06..d2e7e92cdb1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -944,33 +944,33 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { MakeUnaryStringBatchKernel("ascii_upper", registry); MakeUnaryStringBatchKernel("ascii_lower", registry); - AddUnaryStringPredicate("string_isascii", registry); - - AddUnaryStringPredicate("ascii_isalnum", registry); - AddUnaryStringPredicate("ascii_isalpha", registry); - AddUnaryStringPredicate("ascii_isdecimal", registry); - // no isdigic for ascii, since it is the same as isdecimal - AddUnaryStringPredicate("ascii_islower", registry); - // no isnumeric for ascii, since it is the same as isdecimal - AddUnaryStringPredicate("ascii_isprintable", registry); - AddUnaryStringPredicate("ascii_isspace", registry); - AddUnaryStringPredicate("ascii_istitle", registry); - AddUnaryStringPredicate("ascii_isupper", registry); + AddUnaryStringPredicate("string_is_ascii", registry); + + AddUnaryStringPredicate("ascii_is_alnum", registry); + AddUnaryStringPredicate("ascii_is_alpha", registry); + AddUnaryStringPredicate("ascii_is_decimal", registry); + // no is_digic for ascii, since it is the same as is_decimal + AddUnaryStringPredicate("ascii_is_lower", registry); + // no is_numeric for ascii, since it is the same as is_decimal + AddUnaryStringPredicate("ascii_is_printable", registry); + AddUnaryStringPredicate("ascii_is_space", registry); + AddUnaryStringPredicate("ascii_is_title", registry); + AddUnaryStringPredicate("ascii_is_upper", registry); #ifdef ARROW_WITH_UTF8PROC MakeUnaryStringUTF8TransformKernel("utf8_upper", registry); MakeUnaryStringUTF8TransformKernel("utf8_lower", registry); - AddUnaryStringPredicate("utf8_isalnum", registry); - AddUnaryStringPredicate("utf8_isalpha", registry); - AddUnaryStringPredicate("utf8_isdecimal", registry); - AddUnaryStringPredicate("utf8_isdigit", registry); - AddUnaryStringPredicate("utf8_islower", registry); - AddUnaryStringPredicate("utf8_isnumeric", registry); - AddUnaryStringPredicate("utf8_isprintable", registry); - AddUnaryStringPredicate("utf8_isspace", registry); - AddUnaryStringPredicate("utf8_istitle", registry); - AddUnaryStringPredicate("utf8_isupper", registry); + AddUnaryStringPredicate("utf8_is_alnum", registry); + AddUnaryStringPredicate("utf8_is_alpha", registry); + AddUnaryStringPredicate("utf8_is_decimal", registry); + AddUnaryStringPredicate("utf8_is_digit", registry); + AddUnaryStringPredicate("utf8_is_lower", registry); + AddUnaryStringPredicate("utf8_is_numeric", registry); + AddUnaryStringPredicate("utf8_is_printable", registry); + AddUnaryStringPredicate("utf8_is_space", registry); + AddUnaryStringPredicate("utf8_is_title", registry); + AddUnaryStringPredicate("utf8_is_upper", registry); #endif AddBinaryLength(registry); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 0a714f5f6b0..476f56c2dc2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -149,54 +149,54 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO - this->CheckUnary("utf8_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", boolean(), - "[true, null, true, false, false]"); + this->CheckUnary("utf8_is_alnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", + boolean(), "[true, null, true, false, false]"); } TYPED_TEST(TestStringKernels, IsAlphaUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO - this->CheckUnary("utf8_isalpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(), + this->CheckUnary("utf8_is_alpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(), "[true, null, false, false, false]"); } TYPED_TEST(TestStringKernels, IsAscii) { - this->CheckUnary("string_isascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(), + this->CheckUnary("string_is_ascii", "[\"azAZ~\", null, \"Ɑ\", \"\"]", boolean(), "[true, null, false, true]"); } TYPED_TEST(TestStringKernels, IsDecimalUnicode) { // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal) - this->CheckUnary("utf8_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + this->CheckUnary("utf8_is_decimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(), "[true, null, true, false, false, false]"); } TYPED_TEST(TestStringKernels, IsDigitUnicode) { // These are digits according to Python, but we don't have the information in // utf8proc for this - // this->CheckUnary("utf8_isdigit", "[\"²\", \"①\"]", boolean(), "[true, + // this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true, // true]"); } TYPED_TEST(TestStringKernels, IsNumericUnicode) { // ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal) - this->CheckUnary("utf8_isnumeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + this->CheckUnary("utf8_is_numeric", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(), "[true, null, true, true, false, false]"); // These are numerical according to Python, but we don't have the information in // utf8proc for this - // this->CheckUnary("utf8_isnumeric", "[\"㐅\", \"卌\"]", boolean(), + // this->CheckUnary("utf8_is_numeric", "[\"㐅\", \"卌\"]", boolean(), // "[true, null, true, true, false, false]"); } TYPED_TEST(TestStringKernels, IsLowerUnicode) { // ٣ is arabic 3 (decimal), Φ capital - this->CheckUnary("utf8_islower", + this->CheckUnary("utf8_is_lower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", " "\"With space\"]", boolean(), "[false, null, true, false, true, false, false, true, false]"); // lower case character utf8proc does not know about - // this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true, + // this->CheckUnary("utf8_is_lower", "[\"ª\", \"ₕ\"]", boolean(), "[true, // true]"); } @@ -204,26 +204,26 @@ TYPED_TEST(TestStringKernels, IsPrintableUnicode) { // U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category this->CheckUnary( - "utf8_isprintable", + "utf8_is_printable", "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(), "[true, null, false, true, false, false]"); } TYPED_TEST(TestStringKernels, IsSpaceUnicode) { // U+2008 (utf8: \xe2\x80\x88) is punctuaction space - this->CheckUnary("utf8_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), + this->CheckUnary("utf8_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); - this->CheckUnary("utf8_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", + this->CheckUnary("utf8_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(), "[false, null, false, false, true]"); } TYPED_TEST(TestStringKernels, IsTitleUnicode) { // ٣ is arabic 3 (decimal), Φ capital - this->CheckUnary("utf8_istitle", + this->CheckUnary("utf8_is_title", "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", boolean(), "[true, null, true, true, true, true, true]"); this->CheckUnary( - "utf8_istitle", + "utf8_is_title", "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]", boolean(), "[false, null, false, false, false, false, false, false]"); } @@ -233,9 +233,10 @@ TYPED_TEST(TestStringKernels, IsTitleUnicode) { TYPED_TEST(TestStringKernels, IsUpperUnicode) { // ٣ is arabic 3 (decimal), Φ capital - this->CheckUnary( - "utf8_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]", - boolean(), "[false, null, false, true, true, true, false, true, true]"); + this->CheckUnary("utf8_is_upper", + "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"Ⅰ\", \"Ⅿ\"]", + boolean(), + "[false, null, false, true, true, true, false, true, true]"); // * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ // * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower // case @@ -245,7 +246,7 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) { // * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13 // (not tested since it depends on the version of libutf8proc) // * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13 - this->CheckUnary("utf8_isupper", + this->CheckUnary("utf8_is_upper", "[\"Ⓐ\", \"Ⓩ\", \"ϒ\", \"ᾈ\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]", boolean(), "[true, true, true, false, true, false]"); } @@ -255,32 +256,33 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) { #endif // ARROW_WITH_UTF8PROC TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) { - this->CheckUnary("ascii_isalnum", + this->CheckUnary("ascii_is_alnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]", boolean(), "[false, null, false, false, false, false, false]"); - this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]", + this->CheckUnary("ascii_is_alnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]", boolean(), "[true, null, true, true, true, false]"); } TYPED_TEST(TestStringKernels, IsAlphaAscii) { - this->CheckUnary("ascii_isalpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]", + this->CheckUnary("ascii_is_alpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]", boolean(), "[false, true, null, false, false, false]"); } TYPED_TEST(TestStringKernels, IsDecimalAscii) { // ٣ is arabic 3 - this->CheckUnary("ascii_isdecimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", + this->CheckUnary("ascii_is_decimal", "[\"12\", null, \"٣\", \"Ⅳ\", \"1a\", \"\"]", boolean(), "[true, null, false, false, false, false]"); } TYPED_TEST(TestStringKernels, IsLowerAscii) { // ٣ is arabic 3 (decimal), φ lower greek - this->CheckUnary("ascii_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", - boolean(), "[false, null, true, false, true, false, false]"); + this->CheckUnary("ascii_is_lower", + "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", boolean(), + "[false, null, true, false, true, false, false]"); } TYPED_TEST(TestStringKernels, IsPrintableAscii) { // \xe2\x80\x88 is punctuaction space - this->CheckUnary("ascii_isprintable", + this->CheckUnary("ascii_is_printable", "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(), "[true, null, false, true, false]"); } @@ -288,28 +290,29 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) { TYPED_TEST(TestStringKernels, IsSpaceAscii) { // \xe2\x80\x88 is punctuaction space // Note: for ascii version, the non-ascii chars are seen as non-cased - this->CheckUnary("ascii_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), + this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); - this->CheckUnary("ascii_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", + this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", boolean(), "[false, null, false, false, false]"); } TYPED_TEST(TestStringKernels, IsTitleAscii) { // ٣ is arabic 3 (decimal), Φ capital // Note: for ascii version, the non-ascii chars are seen as non-cased - this->CheckUnary("ascii_istitle", + this->CheckUnary("ascii_is_title", "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", boolean(), "[true, null, true, true, true, false, false]"); this->CheckUnary( - "ascii_istitle", + "ascii_is_title", "[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]", boolean(), "[false, null, false, false, true, false, false, false]"); } TYPED_TEST(TestStringKernels, IsUpperAscii) { // ٣ is arabic 3 (decimal), Φ capital greek - this->CheckUnary("ascii_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", - boolean(), "[false, null, false, true, true, false, false]"); + this->CheckUnary("ascii_is_upper", + "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", boolean(), + "[false, null, false, true, true, false, false]"); } TYPED_TEST(TestStringKernels, MatchSubstring) { diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 70a0e764de3..cc9847e2dce 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -110,26 +110,26 @@ def func(left, right): utf8_upper = _simple_unary_function('utf8_upper') utf8_lower = _simple_unary_function('utf8_lower') -string_isascii = _simple_unary_function('string_isascii') - -ascii_isalnum = _simple_unary_function('ascii_isalnum') -utf8_isalnum = _simple_unary_function('utf8_isalnum') -ascii_isalpha = _simple_unary_function('ascii_isalpha') -utf8_isalpha = _simple_unary_function('utf8_isalpha') -ascii_isdecimal = _simple_unary_function('ascii_isdecimal') -utf8_isdecimal = _simple_unary_function('utf8_isdecimal') -ascii_isdigit = ascii_isdecimal # alias -utf8_isdigit = _simple_unary_function('utf8_isdigit') -ascii_islower = _simple_unary_function('ascii_islower') -utf8_islower = _simple_unary_function('utf8_islower') -ascii_isnumeric = ascii_isdecimal # alias -utf8_isnumeric = _simple_unary_function('utf8_isnumeric') -ascii_isprintable = _simple_unary_function('ascii_isprintable') -utf8_isprintable = _simple_unary_function('utf8_isprintable') -ascii_istitle = _simple_unary_function('ascii_istitle') -utf8_istitle = _simple_unary_function('utf8_istitle') -ascii_isupper = _simple_unary_function('ascii_isupper') -utf8_isupper = _simple_unary_function('utf8_isupper') +string_is_ascii = _simple_unary_function('string_is_ascii') + +ascii_is_alnum = _simple_unary_function('ascii_is_alnum') +utf8_is_alnum = _simple_unary_function('utf8_is_alnum') +ascii_is_alpha = _simple_unary_function('ascii_is_alpha') +utf8_is_alpha = _simple_unary_function('utf8_is_alpha') +ascii_is_decimal = _simple_unary_function('ascii_is_decimal') +utf8_is_decimal = _simple_unary_function('utf8_is_decimal') +ascii_is_digit = ascii_is_decimal # alias +utf8_is_digit = _simple_unary_function('utf8_is_digit') +ascii_is_lower = _simple_unary_function('ascii_is_lower') +utf8_is_lower = _simple_unary_function('utf8_is_lower') +ascii_is_numeric = ascii_is_decimal # alias +utf8_is_numeric = _simple_unary_function('utf8_is_numeric') +ascii_is_printable = _simple_unary_function('ascii_is_printable') +utf8_is_printable = _simple_unary_function('utf8_is_printable') +ascii_is_title = _simple_unary_function('ascii_is_title') +utf8_is_title = _simple_unary_function('utf8_is_title') +ascii_is_upper = _simple_unary_function('ascii_is_upper') +utf8_is_upper = _simple_unary_function('utf8_is_upper') is_valid = _simple_unary_function('is_valid') is_null = _simple_unary_function('is_null') diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 5f8c445162f..f93366d15f2 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -125,7 +125,7 @@ def find_new_unicode_codepoints(): new = set() characters = [chr(c) for c in range(0x80, 0x11000) if not (0xD800 <= c < 0xE000)] - is_printable = pc.utf8_isprintable(pa.array(characters)).to_pylist() + is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist() for i, c in enumerate(characters): if is_printable[i] != c.isprintable(): new.add(ord(c)) @@ -134,9 +134,9 @@ def find_new_unicode_codepoints(): # Python claims there are not alpha, not sure why, they are in # gc='Other Letter': https://graphemica.com/%E1%B3%B2 -unknown_issue_isalpha = {0x1cf2, 0x1cf3} +unknown_issue_is_alpha = {0x1cf2, 0x1cf3} # utf8proc does not know if codepoints are lower case -utf8proc_issue_islower = { +utf8proc_issue_is_lower = { 0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4, 0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0, 0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d, @@ -208,23 +208,24 @@ def find_new_unicode_codepoints(): 0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, } codepoints_ignore = { - 'isalnum': numeric_info_missing | digit_info_missing | - unknown_issue_isalpha, - 'isalpha': unknown_issue_isalpha, - 'isdigit': digit_info_missing, - 'isnumeric': numeric_info_missing, - 'islower': utf8proc_issue_islower + 'is_alnum': numeric_info_missing | digit_info_missing | + unknown_issue_is_alpha, + 'is_alpha': unknown_issue_is_alpha, + 'is_digit': digit_info_missing, + 'is_numeric': numeric_info_missing, + 'is_lower': utf8proc_issue_is_lower } -@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii', - 'isdecimal', 'isdigit', 'islower', - 'isnumeric', 'isprintable', - 'isspace', 'isupper', ]) +@pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha', + 'is_ascii', 'is_decimal', + 'is_digit', 'is_lower', + 'is_numeric', 'is_printable', + 'is_space', 'is_upper', ]) @pytest.mark.parametrize('variant', ['ascii', 'utf8']) def test_string_py_compat_boolean(function_name, variant): arrow_name = variant + "_" + function_name - py_name = function_name + py_name = function_name.replace('_', '') ignore = codepoints_ignore.get(function_name, set()) |\ find_new_unicode_codepoints() for i in range(128 if ascii else 0x11000): From 97ee59f6b340bcdc3e27e898206c9176d1c7aa81 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 14 Jul 2020 16:08:51 -0500 Subject: [PATCH 2/2] Fix some typos --- cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index d2e7e92cdb1..0d6b8da3903 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -949,7 +949,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { AddUnaryStringPredicate("ascii_is_alnum", registry); AddUnaryStringPredicate("ascii_is_alpha", registry); AddUnaryStringPredicate("ascii_is_decimal", registry); - // no is_digic for ascii, since it is the same as is_decimal + // no is_digit for ascii, since it is the same as is_decimal AddUnaryStringPredicate("ascii_is_lower", registry); // no is_numeric for ascii, since it is the same as is_decimal AddUnaryStringPredicate("ascii_is_printable", registry); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 476f56c2dc2..a96716ad39c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -128,7 +128,7 @@ TYPED_TEST(TestStringKernels, Utf8Lower) { this->CheckUnary("utf8_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), "[\"aaazzææ&\", null, \"\", \"b\"]"); - // test varying encoding lenghts and thus changing indices/offsets + // test varying encoding lengths and thus changing indices/offsets this->CheckUnary("utf8_lower", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", this->type(), "[\"ɑɽɽow\", null, \"ıi\", \"b\"]"); @@ -201,7 +201,7 @@ TYPED_TEST(TestStringKernels, IsLowerUnicode) { } TYPED_TEST(TestStringKernels, IsPrintableUnicode) { - // U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable + // U+2008 (utf8: \xe2\x80\x88) is punctuation space, it is NOT printable // U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category this->CheckUnary( "utf8_is_printable", @@ -210,7 +210,7 @@ TYPED_TEST(TestStringKernels, IsPrintableUnicode) { } TYPED_TEST(TestStringKernels, IsSpaceUnicode) { - // U+2008 (utf8: \xe2\x80\x88) is punctuaction space + // U+2008 (utf8: \xe2\x80\x88) is punctuation space this->CheckUnary("utf8_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); this->CheckUnary("utf8_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", @@ -281,14 +281,14 @@ TYPED_TEST(TestStringKernels, IsLowerAscii) { "[false, null, true, false, true, false, false]"); } TYPED_TEST(TestStringKernels, IsPrintableAscii) { - // \xe2\x80\x88 is punctuaction space + // \xe2\x80\x88 is punctuation space this->CheckUnary("ascii_is_printable", "[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(), "[true, null, false, true, false]"); } TYPED_TEST(TestStringKernels, IsSpaceAscii) { - // \xe2\x80\x88 is punctuaction space + // \xe2\x80\x88 is punctuation space // Note: for ascii version, the non-ascii chars are seen as non-cased this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]");