From c9d0a478683e920867016aed7fcbef070a05ca6c Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 4 Aug 2021 08:20:04 -0400 Subject: [PATCH 01/23] initial impl of ASCII title --- .../arrow/compute/kernels/scalar_string.cc | 55 ++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f5de81acd67..16bba2aeef4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -81,6 +81,22 @@ static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { : utf8_code_unit; } +static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { + if (IsLowerCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit -= 32; + } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit += 32; + } + return utf8_code_unit; +} + +// IsAlpha/Digit etc + +template +static inline bool IsAsciiCharacter(T character) { + return character < 128; +} + static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { return (ascii_character >= 'a') && (ascii_character <= 'z'); } @@ -117,20 +133,6 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { return ((ascii_character >= ' ') && (ascii_character <= '~')); } -static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { - if (IsLowerCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit -= 32; - } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit += 32; - } - return utf8_code_unit; -} - -template -static inline bool IsAsciiCharacter(T character) { - return character < 128; -} - struct BinaryLength { template static OutValue Call(KernelContext*, Arg0Value val, Status*) { @@ -677,6 +679,31 @@ struct AsciiCapitalizeTransform : public StringTransformBase { template using AsciiCapitalize = StringTransformExec; +struct AsciiTitleTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { + uint8_t* c = input; + const uint8_t* end = input + input_string_ncodeunits; + while (c <= end) { + uint8_t* c_; + // Uppercase first caseable character of current word + while ((c_ = c++) <= end) { + if (IsCasedCharacterAscii(*c_)) { + *c_ = ascii_toupper(*c_); + break; + } + } + // Lowercase characters until a whitespace is found + while ((c_ = c++) <= end) { + if (IsSpaceCharacterAscii(*c_)) { + break; + } + *c_ = ascii_tolower(*c_); + } + } + return input_string_ncodeunits; + } +}; + // ---------------------------------------------------------------------- // exact pattern detection From e05fa30503a814afd28fbcc50c3eab51b6c2009d Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 4 Aug 2021 14:14:42 -0400 Subject: [PATCH 02/23] update ASCII title kernel --- cpp/src/arrow/compute/kernels/scalar_string.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 16bba2aeef4..1e2e974ac32 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -685,7 +685,7 @@ struct AsciiTitleTransform : public StringTransformBase { const uint8_t* end = input + input_string_ncodeunits; while (c <= end) { uint8_t* c_; - // Uppercase first caseable character of current word + // Uppercase first alpha character of current word while ((c_ = c++) <= end) { if (IsCasedCharacterAscii(*c_)) { *c_ = ascii_toupper(*c_); @@ -704,6 +704,9 @@ struct AsciiTitleTransform : public StringTransformBase { } }; +template +using AsciiTitle = StringTransformExec; + // ---------------------------------------------------------------------- // exact pattern detection @@ -4154,6 +4157,13 @@ const FunctionDoc ascii_capitalize_doc( "non-ASCII characters, use \"utf8_capitalize\" instead."), {"strings"}); +const FunctionDoc ascii_title_doc( + "Transform ASCII input to a string where the first alpha character in each word is uppercase and all other characters are lowercase", + ("For each string in `strings`, return a title version.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_title\" instead."), + {"strings"}); + const FunctionDoc ascii_reverse_doc( "Reverse ASCII input", ("For each ASCII string in `strings`, return a reversed version.\n\n" @@ -4201,6 +4211,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_capitalize", registry, &ascii_capitalize_doc); + MakeUnaryStringBatchKernel("ascii_title", registry, &ascii_title_doc); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, From b34e198149c56cf4add9ce6eb7ce2e433e1146a7 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 5 Aug 2021 07:23:23 -0400 Subject: [PATCH 03/23] add ASCII title kernel --- .../arrow/compute/kernels/scalar_string.cc | 68 ++++++++++--------- .../compute/kernels/scalar_string_test.cc | 14 +++- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 1e2e974ac32..3cf25a6d11c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -66,29 +66,10 @@ Status RegexStatus(const RE2& regex) { } #endif -// Code units in the range [a-z] can only be an encoding of an ascii -// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different -// codepoint. This guaranteed by non-overlap design of the unicode standard. (see -// section 2.5 of Unicode Standard Core Specification v13.0) - -static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) { - return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) - : utf8_code_unit; -} - -static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { - return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32) - : utf8_code_unit; -} - -static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { - if (IsLowerCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit -= 32; - } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit += 32; - } - return utf8_code_unit; -} +// Code units in the range [a-z] can only be an encoding of an ASCII +// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different +// codepoint. This is guaranteed by the non-overlap design of the Unicode +// standard. (see section 2.5 of Unicode Standard Core Specification v13.0) // IsAlpha/Digit etc @@ -149,6 +130,25 @@ struct Utf8Length { } }; +static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) { + return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) + : utf8_code_unit; +} + +static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { + return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32) + : utf8_code_unit; +} + +static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { + if (IsLowerCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit -= 32; + } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit += 32; + } + return utf8_code_unit; +} + #ifdef ARROW_WITH_UTF8PROC // Direct lookup tables for unicode properties @@ -681,23 +681,24 @@ using AsciiCapitalize = StringTransformExec; struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - uint8_t* c = input; - const uint8_t* end = input + input_string_ncodeunits; - while (c <= end) { - uint8_t* c_; + const uint8_t* const end = input + input_string_ncodeunits; + while (input <= end) { + const uint8_t* c_; // Uppercase first alpha character of current word - while ((c_ = c++) <= end) { + while ((c_ = input++) <= end) { if (IsCasedCharacterAscii(*c_)) { - *c_ = ascii_toupper(*c_); + *output++ = ascii_toupper(*c_); break; } + *output++ = *c_; } // Lowercase characters until a whitespace is found - while ((c_ = c++) <= end) { + while ((c_ = input++) <= end) { if (IsSpaceCharacterAscii(*c_)) { + *output++ = *c_; break; } - *c_ = ascii_tolower(*c_); + *output++ = ascii_tolower(*c_); } } return input_string_ncodeunits; @@ -4201,8 +4202,9 @@ const FunctionDoc utf8_reverse_doc( } // namespace void RegisterScalarStringAscii(FunctionRegistry* registry) { - // ascii_upper and ascii_lower are able to reuse the original offsets buffer, - // so don't preallocate them in the output. + // Some kernels are able to reuse the original offsets buffer, so don't + // preallocate them in the output. Only kernels that invoke + // "StringDataTransform()" support no preallocation. MakeUnaryStringBatchKernel("ascii_upper", registry, &ascii_upper_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 271ecb0b415..80aeed6ca7d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -413,6 +413,16 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) { "\"!hello, world!\"]"); } +TYPED_TEST(TestStringKernels, AsciiTitle) { + this->CheckUnary("ascii_title", "[]", this->type(), "[]"); + this->CheckUnary("ascii_title", + "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", " + "\"!hELlo, wORLd!\", \" a b cd\"]", + this->type(), + "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", \"$. A3\", " + "\"!Hello, World!\", \" A B Cd\"]"); +} + TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), @@ -522,7 +532,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { // test maximum buffer growth this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]"); - this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + this->CheckUnary("utf8_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); // Test invalid data @@ -532,7 +542,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { } TYPED_TEST(TestStringKernels, Utf8Capitalize) { - this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]"); + this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]"); this->CheckUnary("utf8_capitalize", "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", From daa4cb7dfa6169be7fe1a47ba34e3f96f1f6ae33 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 5 Aug 2021 07:27:23 -0400 Subject: [PATCH 04/23] update docs for ASCII title --- docs/source/cpp/compute.rst | 6 ++++-- docs/source/python/api/compute.rst | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 465500e8dae..26ffb640b21 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -697,7 +697,7 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +=========================+=======+========================+========================+===================================+=======+ -| ascii_capitalize | Unary | String-like | String-like | | | +| ascii_capitalize | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_lower | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ @@ -705,6 +705,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_swapcase | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_title | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_upper | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) | @@ -715,7 +717,7 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ -| utf8_capitalize | Unary | String-like | String-like | | | +| utf8_capitalize | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_length | Unary | String-like | Int32 or Int64 | | \(7) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b737439147e..96679babaa9 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -264,6 +264,7 @@ String Transforms ascii_rtrim ascii_rtrim_whitespace ascii_swapcase + ascii_title ascii_trim ascii_upper binary_length From 70945058155245b1d1da1cf1dfba22bfacbaa9c2 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Thu, 5 Aug 2021 08:12:13 -0400 Subject: [PATCH 05/23] fix lint error --- cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 3cf25a6d11c..44d91a459fa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -4204,7 +4204,7 @@ const FunctionDoc utf8_reverse_doc( void RegisterScalarStringAscii(FunctionRegistry* registry) { // Some kernels are able to reuse the original offsets buffer, so don't // preallocate them in the output. Only kernels that invoke - // "StringDataTransform()" support no preallocation. + // "StringDataTransform" support no preallocation. MakeUnaryStringBatchKernel("ascii_upper", registry, &ascii_upper_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, From 8e4c02735e7bfe5564733fa0601fba56cfa58c81 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 18 Aug 2021 16:40:49 -0400 Subject: [PATCH 06/23] fix linting errors --- cpp/src/arrow/compute/kernels/scalar_string.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 44d91a459fa..b5f2e978be1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -680,7 +680,8 @@ template using AsciiCapitalize = StringTransformExec; struct AsciiTitleTransform : public StringTransformBase { - int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { const uint8_t* const end = input + input_string_ncodeunits; while (input <= end) { const uint8_t* c_; @@ -4159,7 +4160,8 @@ const FunctionDoc ascii_capitalize_doc( {"strings"}); const FunctionDoc ascii_title_doc( - "Transform ASCII input to a string where the first alpha character in each word is uppercase and all other characters are lowercase", + "Transform ASCII input to a string where the first alpha character in each word is " + "uppercase and all other characters are lowercase", ("For each string in `strings`, return a title version.\n\n" "This function assumes the input is fully ASCII. If it may contain\n" "non-ASCII characters, use \"utf8_title\" instead."), From 3b8c4ce208c6123adcc9a6ebbf1ca082dbeb8f77 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 27 Aug 2021 06:36:16 -0400 Subject: [PATCH 07/23] update capitalize/title and add UTF8 title --- .../arrow/compute/kernels/scalar_string.cc | 148 +++++++++++------- 1 file changed, 95 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index b5f2e978be1..618120ce97d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -422,17 +422,25 @@ struct StringTransformExecWithState #ifdef ARROW_WITH_UTF8PROC -template -struct StringTransformCodepoint : public StringTransformBase { +struct StringTransformCodepointBase : public StringTransformBase { Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override { EnsureLookupTablesFilled(); return Status::OK(); } int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { - return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits); + // Section 5.18 of the Unicode spec claims that the number of codepoints for case + // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes + // However, since we don't support all casings (SpecialCasing.txt) the growth + // in bytes is actually only at max 3/2 (as covered by the unittest). + // Note that rounding down the 3/2 is ok, since only codepoints encoded by + // two code units (even) can grow to 3 code units. + return static_cast(input_ncodeunits) * 3 / 2; } +}; +template +struct StringTransformCodepoint : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; @@ -445,20 +453,7 @@ struct StringTransformCodepoint : public StringTransformBase { } }; -// struct CaseMappingMixin { -struct CaseMappingTransform { - static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) { - // Section 5.18 of the Unicode spec claims that the number of codepoints for case - // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes - // However, since we don't support all casings (SpecialCasing.txt) the growth - // in bytes is actually only at max 3/2 (as covered by the unittest). - // Note that rounding down the 3/2 is ok, since only codepoints encoded by - // two code units (even) can grow to 3 code units. - return static_cast(input_ncodeunits) * 3 / 2; - } -}; - -struct UTF8UpperTransform : public CaseMappingTransform { +struct UTF8UpperTransform : public StringTransformCodepointBase { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint] : utf8proc_toupper(codepoint); @@ -468,7 +463,7 @@ struct UTF8UpperTransform : public CaseMappingTransform { template using UTF8Upper = StringTransformExec>; -struct UTF8LowerTransform : public CaseMappingTransform { +struct UTF8LowerTransform : public StringTransformCodepointBase { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint] : utf8proc_tolower(codepoint); @@ -478,7 +473,7 @@ struct UTF8LowerTransform : public CaseMappingTransform { template using UTF8Lower = StringTransformExec>; -struct UTF8SwapCaseTransform : public CaseMappingTransform { +struct UTF8SwapCaseTransform : public StringTransformCodepointBase { static uint32_t TransformCodepoint(uint32_t codepoint) { if (codepoint <= kMaxCodepointLookup) { return lut_swapcase_codepoint[codepoint]; @@ -498,29 +493,20 @@ template using UTF8SwapCase = StringTransformExec>; -struct Utf8CapitalizeTransform : public StringTransformBase { +struct Utf8CapitalizeTransform : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; - if (input_string_ncodeunits > 0) { - // Get number of code units in first code point - uint32_t codepoint = 0; - const uint8_t* i = input; - if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) { - return kTransformError; - } - int64_t codepoint_ncodeunits = - std::min(static_cast(i - input), input_string_ncodeunits); - if (ARROW_PREDICT_FALSE( - !util::UTF8Transform(input, input + codepoint_ncodeunits, &output, - UTF8UpperTransform::TransformCodepoint))) { - return kTransformError; - } - if (ARROW_PREDICT_FALSE(!util::UTF8Transform( - input + codepoint_ncodeunits, input + input_string_ncodeunits, &output, - UTF8LowerTransform::TransformCodepoint))) { - return kTransformError; - } + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* next = input; + if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) { + return kTransformError; + } + if (ARROW_PREDICT_FALSE(!util::UTF8Transform(input, next, &output, UTF8UpperTransform::TransformCodepoint))) { + return kTransformError; + } + if (ARROW_PREDICT_FALSE(!util::UTF8Transform(next, end, &output, UTF8LowerTransform::TransformCodepoint))) { + return kTransformError; } return output - output_start; } @@ -529,6 +515,53 @@ struct Utf8CapitalizeTransform : public StringTransformBase { template using Utf8Capitalize = StringTransformExec; +struct Utf8TitleTransform: public StringTransformCodepointBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + uint8_t* output_start = output; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* curr = NULLPTR; + uint32_t codepoint = 0; + + do { + // Uppercase first alpha character of current word + while (input < end) { + curr = input; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) { + return kTransformError; + } + if (IsCasedCharacterUnicode(codepoint)) { + output = util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint)); + input = curr; + break; + } + output = std::copy(input, curr, output); + input = curr; + } + + // Lowercase characters until a whitespace is found + while (input < end) { + curr = input; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) { + return kTransformError; + } + if (IsSpaceCharacterUnicode(codepoint)) { + output = std::copy(input, curr, output); + input = curr; + break; + } + output = util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint)); + input = curr; + } + } while (input < end); + + return output - output_start; + } +}; + +template +using Utf8Title = StringTransformExec; + #endif // ARROW_WITH_UTF8PROC struct AsciiReverseTransform : public StringTransformBase { @@ -669,8 +702,8 @@ struct AsciiCapitalizeTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { if (input_string_ncodeunits > 0) { - *output = ascii_toupper(*input); - TransformAsciiLower(input + 1, input_string_ncodeunits - 1, output + 1); + *output++ = ascii_toupper(*input++); + TransformAsciiLower(input, input_string_ncodeunits - 1, output); } return input_string_ncodeunits; } @@ -682,26 +715,29 @@ using AsciiCapitalize = StringTransformExec; struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - const uint8_t* const end = input + input_string_ncodeunits; - while (input <= end) { - const uint8_t* c_; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* curr = NULLPTR; + + do { // Uppercase first alpha character of current word - while ((c_ = input++) <= end) { - if (IsCasedCharacterAscii(*c_)) { - *output++ = ascii_toupper(*c_); + while ((curr = input++) < end) { + if (IsCasedCharacterAscii(*curr)) { + *output++ = ascii_toupper(*curr); break; } - *output++ = *c_; + *output++ = *curr; } + // Lowercase characters until a whitespace is found - while ((c_ = input++) <= end) { - if (IsSpaceCharacterAscii(*c_)) { - *output++ = *c_; + while ((curr = input++) < end) { + if (IsSpaceCharacterAscii(*curr)) { + *output++ = *curr; break; } - *output++ = ascii_tolower(*c_); + *output++ = ascii_tolower(*curr); } - } + } while (input < end); + return input_string_ncodeunits; } }; @@ -4193,6 +4229,11 @@ const FunctionDoc utf8_capitalize_doc( "with the first character uppercased and the others lowercased."), {"strings"}); +const FunctionDoc utf8_title_doc( + "Transform input to a string where the first alpha character in each word is " + "uppercase and all other characters are lowercase", + ("For each string in `strings`, return a title version."), {"strings"}); + const FunctionDoc utf8_reverse_doc( "Reverse input", ("For each string in `strings`, return a reversed version.\n\n" @@ -4263,6 +4304,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { &utf8_swapcase_doc); MakeUnaryStringBatchKernel("utf8_capitalize", registry, &utf8_capitalize_doc); + MakeUnaryStringBatchKernel("utf8_title", registry, &utf8_title_doc); MakeUnaryStringBatchKernel("utf8_trim_whitespace", registry, &utf8_trim_whitespace_doc); MakeUnaryStringBatchKernel("utf8_ltrim_whitespace", registry, From e77abe9debcd31e350b96be29a8831dede2730d7 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 27 Aug 2021 06:36:34 -0400 Subject: [PATCH 08/23] add UTF8 title tests --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 80aeed6ca7d..a7ac9949be7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -551,6 +551,16 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) { "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]"); } +TYPED_TEST(TestStringKernels, Utf8Title) { + this->CheckUnary("utf8_title", "[]", this->type(), "[]"); + this->CheckUnary("utf8_title", + "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " + "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", + this->type(), + "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", " + "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]"); +} + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO From 8fcff85cde57d4da8b5de63d6d8b12c0e0d36f5d Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 27 Aug 2021 06:40:38 -0400 Subject: [PATCH 09/23] update docs --- docs/source/cpp/compute.rst | 2 ++ docs/source/python/api/compute.rst | 1 + 2 files changed, 3 insertions(+) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 26ffb640b21..7263d77acf2 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -729,6 +729,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_swapcase | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| utf8_title | Unary | String-like | String-like | | \(8) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_upper | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 96679babaa9..f7f740c24e5 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -284,6 +284,7 @@ String Transforms utf8_rtrim utf8_rtrim_whitespace utf8_swapcase + utf8_title utf8_trim utf8_upper From d0f277cb1104f22fa5e21ce0b9323ad99b80e3c1 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Fri, 27 Aug 2021 06:42:32 -0400 Subject: [PATCH 10/23] apply lint changes --- cpp/src/arrow/compute/kernels/scalar_string.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 618120ce97d..938daffa011 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -502,10 +502,12 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase { if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) { return kTransformError; } - if (ARROW_PREDICT_FALSE(!util::UTF8Transform(input, next, &output, UTF8UpperTransform::TransformCodepoint))) { + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + input, next, &output, UTF8UpperTransform::TransformCodepoint))) { return kTransformError; } - if (ARROW_PREDICT_FALSE(!util::UTF8Transform(next, end, &output, UTF8LowerTransform::TransformCodepoint))) { + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + next, end, &output, UTF8LowerTransform::TransformCodepoint))) { return kTransformError; } return output - output_start; @@ -515,7 +517,7 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase { template using Utf8Capitalize = StringTransformExec; -struct Utf8TitleTransform: public StringTransformCodepointBase { +struct Utf8TitleTransform : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; @@ -531,7 +533,8 @@ struct Utf8TitleTransform: public StringTransformCodepointBase { return kTransformError; } if (IsCasedCharacterUnicode(codepoint)) { - output = util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint)); + output = + util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint)); input = curr; break; } @@ -550,7 +553,8 @@ struct Utf8TitleTransform: public StringTransformCodepointBase { input = curr; break; } - output = util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint)); + output = + util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint)); input = curr; } } while (input < end); From 27009136ed2cbd79cf72c6f48b1a7d2071410f5f Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 30 Aug 2021 08:01:02 -0400 Subject: [PATCH 11/23] update function description, change copy for memcpy, add input size check --- .../arrow/compute/kernels/scalar_string.cc | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 938daffa011..8ce0f2a767f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -499,16 +499,18 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase { uint8_t* output_start = output; const uint8_t* end = input + input_string_ncodeunits; const uint8_t* next = input; - if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) { - return kTransformError; - } - if (ARROW_PREDICT_FALSE(!util::UTF8Transform( - input, next, &output, UTF8UpperTransform::TransformCodepoint))) { - return kTransformError; - } - if (ARROW_PREDICT_FALSE(!util::UTF8Transform( - next, end, &output, UTF8LowerTransform::TransformCodepoint))) { - return kTransformError; + if (input_string_ncodeunits > 0) { + if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) { + return kTransformError; + } + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + input, next, &output, UTF8UpperTransform::TransformCodepoint))) { + return kTransformError; + } + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + next, end, &output, UTF8LowerTransform::TransformCodepoint))) { + return kTransformError; + } } return output - output_start; } @@ -522,7 +524,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { uint8_t* output) { uint8_t* output_start = output; const uint8_t* end = input + input_string_ncodeunits; - const uint8_t* curr = NULLPTR; + const uint8_t* curr = nullptr; uint32_t codepoint = 0; do { @@ -538,7 +540,8 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { input = curr; break; } - output = std::copy(input, curr, output); + std::memcpy(output, input, curr - input); + output += curr - input; input = curr; } @@ -549,7 +552,8 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { return kTransformError; } if (IsSpaceCharacterUnicode(codepoint)) { - output = std::copy(input, curr, output); + std::memcpy(output, input, curr - input); + output += curr - input; input = curr; break; } @@ -720,7 +724,7 @@ struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { const uint8_t* end = input + input_string_ncodeunits; - const uint8_t* curr = NULLPTR; + const uint8_t* curr = nullptr; do { // Uppercase first alpha character of current word @@ -4200,9 +4204,10 @@ const FunctionDoc ascii_capitalize_doc( {"strings"}); const FunctionDoc ascii_title_doc( - "Transform ASCII input to a string where the first alpha character in each word is " - "uppercase and all other characters are lowercase", - ("For each string in `strings`, return a title version.\n\n" + "Titlecase each word of ASCII input", + ("For each string in `strings`, return a titlecased version.\n" + "Each word in the output will start with an uppercase character and its\n" + "remaining characters will be lowercase.\n\n" "This function assumes the input is fully ASCII. If it may contain\n" "non-ASCII characters, use \"utf8_title\" instead."), {"strings"}); @@ -4234,9 +4239,11 @@ const FunctionDoc utf8_capitalize_doc( {"strings"}); const FunctionDoc utf8_title_doc( - "Transform input to a string where the first alpha character in each word is " - "uppercase and all other characters are lowercase", - ("For each string in `strings`, return a title version."), {"strings"}); + "Titlecase each word of input", + ("For each string in `strings`, return a titlecased version.\n" + "Each word in the output will start with an uppercase character and its\n" + "remaining characters will be lowercase."), + {"strings"}); const FunctionDoc utf8_reverse_doc( "Reverse input", From 98b6b336a0555b4c81d8afbdc4e5ce6974ecc6a5 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Mon, 30 Aug 2021 21:32:02 -0400 Subject: [PATCH 12/23] update title implementation --- .../arrow/compute/kernels/scalar_string.cc | 89 +++++++------------ .../compute/kernels/scalar_string_test.cc | 25 +++--- 2 files changed, 41 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 8ce0f2a767f..285ee657c2d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -522,47 +522,27 @@ using Utf8Capitalize = StringTransformExec; struct Utf8TitleTransform : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - uint8_t* output_start = output; + const uint8_t* next = input; const uint8_t* end = input + input_string_ncodeunits; - const uint8_t* curr = nullptr; - uint32_t codepoint = 0; - - do { - // Uppercase first alpha character of current word - while (input < end) { - curr = input; - if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) { - return kTransformError; - } - if (IsCasedCharacterUnicode(codepoint)) { - output = - util::UTF8Encode(output, UTF8UpperTransform::TransformCodepoint(codepoint)); - input = curr; - break; - } - std::memcpy(output, input, curr - input); - output += curr - input; - input = curr; + uint8_t* output_start = output; + uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint; + while ((input = next) < end) { + uint32_t codepoint; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) { + return kTransformError; } - - // Lowercase characters until a whitespace is found - while (input < end) { - curr = input; - if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&curr, &codepoint))) { - return kTransformError; - } - if (IsSpaceCharacterUnicode(codepoint)) { - std::memcpy(output, input, curr - input); - output += curr - input; - input = curr; - break; - } - output = - util::UTF8Encode(output, UTF8LowerTransform::TransformCodepoint(codepoint)); - input = curr; + if (IsCasedCharacterUnicode(codepoint)) { + // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased + // codepoints + output = util::UTF8Encode(output, TransformCodepoint(codepoint)); + TransformCodepoint = UTF8LowerTransform::TransformCodepoint; + } else { + // Copy non-cased character, prepare to uppercase next caseable codepoint + std::memcpy(output, input, next - input); + output += next - input; + TransformCodepoint = UTF8UpperTransform::TransformCodepoint; } - } while (input < end); - + } return output - output_start; } }; @@ -724,28 +704,19 @@ struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { const uint8_t* end = input + input_string_ncodeunits; - const uint8_t* curr = nullptr; - - do { - // Uppercase first alpha character of current word - while ((curr = input++) < end) { - if (IsCasedCharacterAscii(*curr)) { - *output++ = ascii_toupper(*curr); - break; - } - *output++ = *curr; - } - - // Lowercase characters until a whitespace is found - while ((curr = input++) < end) { - if (IsSpaceCharacterAscii(*curr)) { - *output++ = *curr; - break; - } - *output++ = ascii_tolower(*curr); + uint8_t (*ascii_toxxx)(uint8_t) = ascii_toupper; + while (input < end) { + if (IsCasedCharacterAscii(*input)) { + // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased + // codepoints + *output++ = ascii_toxxx(*input++); + ascii_toxxx = ascii_tolower; + } else { + // Copy non-cased character, prepare to uppercase next caseable codepoint + *output++ = *input++; + ascii_toxxx = ascii_toupper; } - } while (input < end); - + } return input_string_ncodeunits; } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a7ac9949be7..5b93cbae361 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include @@ -414,13 +415,11 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) { } TYPED_TEST(TestStringKernels, AsciiTitle) { - this->CheckUnary("ascii_title", "[]", this->type(), "[]"); - this->CheckUnary("ascii_title", - "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", " - "\"!hELlo, wORLd!\", \" a b cd\"]", - this->type(), - "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, World!\", \"$. A3\", " - "\"!Hello, World!\", \" A B Cd\"]"); + this->CheckUnary( + "ascii_title", + R"([null, "", "b", "aAaz;ZeA&", "arRoW", "iI", "a.a.a..A", "hEllO, WoRld!", "foo baR;heHe0zOP", "!%$^.,;"])", + this->type(), + R"([null, "", "B", "Aaaz;Zea&", "Arrow", "Ii", "A.A.A..A", "Hello, World!", "Foo Bar;Hehe0Zop", "!%$^.,;"])"); } TYPED_TEST(TestStringKernels, AsciiReverse) { @@ -552,13 +551,11 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) { } TYPED_TEST(TestStringKernels, Utf8Title) { - this->CheckUnary("utf8_title", "[]", this->type(), "[]"); - this->CheckUnary("utf8_title", - "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " - "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", - this->type(), - "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", " - "\"Hello, World!\", \"$. A3\", \"!Ɑɽɽow\"]"); + this->CheckUnary( + "utf8_title", + R"([null, "", "b", "aAaz;ZæÆ&", "ɑɽⱤoW", "ıI", "ⱥ.ⱥ.ⱥ..Ⱥ", "hEllO, WoRld!", "foo baR;héHé0zOP", "!%$^.,;"])", + this->type(), + R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, World!", "Foo Bar;Héhé0Zop", "!%$^.,;"])"); } TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { From c7f184073fc9083a36f07f14fe82b812b5a0f418 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 00:21:00 -0400 Subject: [PATCH 13/23] change while to for-loop in kernels --- .../arrow/compute/kernels/scalar_string.cc | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 285ee657c2d..1672af79c3c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -522,22 +522,21 @@ using Utf8Capitalize = StringTransformExec; struct Utf8TitleTransform : public StringTransformCodepointBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - const uint8_t* next = input; - const uint8_t* end = input + input_string_ncodeunits; uint8_t* output_start = output; uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint; - while ((input = next) < end) { + for (const uint8_t *next = input, *end = input + input_string_ncodeunits; next < end; + input = next) { uint32_t codepoint; if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) { return kTransformError; } if (IsCasedCharacterUnicode(codepoint)) { - // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased + // Lower/uppercase codepoint, prepare to lowercase next consecutive cased // codepoints output = util::UTF8Encode(output, TransformCodepoint(codepoint)); TransformCodepoint = UTF8LowerTransform::TransformCodepoint; } else { - // Copy non-cased character, prepare to uppercase next caseable codepoint + // Copy non-cased codepoint, prepare to uppercase next cased codepoint std::memcpy(output, input, next - input); output += next - input; TransformCodepoint = UTF8UpperTransform::TransformCodepoint; @@ -703,18 +702,17 @@ using AsciiCapitalize = StringTransformExec; struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - const uint8_t* end = input + input_string_ncodeunits; - uint8_t (*ascii_toxxx)(uint8_t) = ascii_toupper; - while (input < end) { - if (IsCasedCharacterAscii(*input)) { - // Transform codepoint (lower/upper), prepare to lowercase next consecutive cased - // codepoints - *output++ = ascii_toxxx(*input++); - ascii_toxxx = ascii_tolower; + uint8_t (*ascii_to_ul)(uint8_t) = ascii_toupper; + for (const uint8_t* ch = input; ch < (input + input_string_ncodeunits); ++ch) { + if (IsCasedCharacterAscii(*ch)) { + // Lower/uppercase character, prepare to lowercase next consecutive cased + // characters + *output++ = ascii_to_ul(*ch); + ascii_to_ul = ascii_tolower; } else { - // Copy non-cased character, prepare to uppercase next caseable codepoint - *output++ = *input++; - ascii_toxxx = ascii_toupper; + // Copy non-cased character, prepare to uppercase next cased character + *output++ = *ch; + ascii_to_ul = ascii_toupper; } } return input_string_ncodeunits; From 6c9b829e12f00d95d7430a17e162882d1917add3 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 02:45:44 -0400 Subject: [PATCH 14/23] minor change to IsTitle --- .../arrow/compute/kernels/scalar_string.cc | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 1672af79c3c..3390ee9d10d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -536,7 +536,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { output = util::UTF8Encode(output, TransformCodepoint(codepoint)); TransformCodepoint = UTF8LowerTransform::TransformCodepoint; } else { - // Copy non-cased codepoint, prepare to uppercase next cased codepoint + // Copy uncased codepoint, prepare to uppercase next cased codepoint std::memcpy(output, input, next - input); output += next - input; TransformCodepoint = UTF8UpperTransform::TransformCodepoint; @@ -710,7 +710,7 @@ struct AsciiTitleTransform : public StringTransformBase { *output++ = ascii_to_ul(*ch); ascii_to_ul = ascii_tolower; } else { - // Copy non-cased character, prepare to uppercase next cased character + // Copy uncased character, prepare to uppercase next cased character *output++ = *ch; ascii_to_ul = ascii_toupper; } @@ -1800,9 +1800,9 @@ struct IsTitleUnicode { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, Status* st) { // rules: - // * 1: lower case follows cased - // * 2: upper case follows uncased - // * 3: at least 1 cased character (which logically should be upper/title) + // 1. lower case follows cased + // 2. upper case follows uncased + // 3. at least 1 cased character (which logically should be upper/title) bool rules_1_and_2; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; @@ -1811,14 +1811,15 @@ struct IsTitleUnicode { [&previous_cased, &rule_3](uint32_t codepoint) { if (IsLowerCaseCharacterUnicode(codepoint)) { if (!previous_cased) return false; // rule 1 broken + // next should be more lower case or uncased previous_cased = true; - } else if (IsCasedCharacterUnicode(codepoint)) { + } else if (IsUpperCaseCharacterUnicode(codepoint)) { if (previous_cased) return false; // rule 2 broken // next should be a lower case or uncased previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a non-cased char, like _ or 1 + // an uncased char, like _ or 1 // next should be upper case or more uncased previous_cased = false; } @@ -1837,13 +1838,13 @@ struct IsTitleAscii { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, Status*) { // rules: - // * 1: lower case follows cased - // * 2: upper case follows uncased - // * 3: at least 1 cased character (which logically should be upper/title) + // 1. lower case follows cased + // 2. upper case follows uncased + // 3. at least 1 cased character (which logically should be upper/title) + // NOTE: Non-ASCII characters are seen as uncased. bool rules_1_and_2 = true; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; - // we cannot rely on std::all_of because we need guaranteed order for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) { if (IsLowerCaseCharacterAscii(*c)) { if (!previous_cased) { @@ -1851,8 +1852,9 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } + // next should be more lower case or uncased previous_cased = true; - } else if (IsCasedCharacterAscii(*c)) { + } else if (IsUpperCaseCharacterAscii(*c)) { if (previous_cased) { // rule 2 broken rules_1_and_2 = false; @@ -1862,7 +1864,7 @@ struct IsTitleAscii { previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a non-cased char, like _ or 1 + // an uncased character, like _ or 1 // next should be upper case or more uncased previous_cased = false; } @@ -4115,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc( "Classify strings as ASCII titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a non-cased character, and each lowercase character follows\n" + "follows a uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const auto utf8_is_alnum_doc = @@ -4140,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc( "Classify strings as titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a non-cased character, and each lowercase character follows\n" + "follows a uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const FunctionDoc ascii_upper_doc( From c002e610e662c0348e7375f43b9ad62d215f96d9 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 12:08:50 -0400 Subject: [PATCH 15/23] remove temp header --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 5b93cbae361..56b10fe2693 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -16,7 +16,6 @@ // under the License. #include -#include #include #include From a1de3a9a52338e604c25cf07d845373e7831be5a Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 22:03:13 -0400 Subject: [PATCH 16/23] add kernel to R mapping --- r/R/expression.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/expression.R b/r/R/expression.R index 57466cc3c71..aa9af9270c9 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -51,6 +51,7 @@ # str_pad is defined in dplyr-functions.R # str_sub is defined in dplyr-functions.R "str_to_lower" = "utf8_lower", + "str_to_title" = "utf8_title", "str_to_upper" = "utf8_upper", # str_trim is defined in dplyr-functions.R "stri_reverse" = "utf8_reverse", From 021a6f4996ed17804f4ede3e6961e42b2cf8e923 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 22:38:09 -0400 Subject: [PATCH 17/23] use caseless term --- .../arrow/compute/kernels/scalar_string.cc | 30 +++++++++---------- .../compute/kernels/scalar_string_test.cc | 4 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 3390ee9d10d..f7424aeb87e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -536,7 +536,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { output = util::UTF8Encode(output, TransformCodepoint(codepoint)); TransformCodepoint = UTF8LowerTransform::TransformCodepoint; } else { - // Copy uncased codepoint, prepare to uppercase next cased codepoint + // Copy caseless codepoint, prepare to uppercase next cased codepoint std::memcpy(output, input, next - input); output += next - input; TransformCodepoint = UTF8UpperTransform::TransformCodepoint; @@ -710,7 +710,7 @@ struct AsciiTitleTransform : public StringTransformBase { *output++ = ascii_to_ul(*ch); ascii_to_ul = ascii_tolower; } else { - // Copy uncased character, prepare to uppercase next cased character + // Copy caseless character, prepare to uppercase next cased character *output++ = *ch; ascii_to_ul = ascii_toupper; } @@ -1801,7 +1801,7 @@ struct IsTitleUnicode { Status* st) { // rules: // 1. lower case follows cased - // 2. upper case follows uncased + // 2. upper case follows caseless // 3. at least 1 cased character (which logically should be upper/title) bool rules_1_and_2; bool previous_cased = false; // in LL, LU or LT @@ -1811,16 +1811,16 @@ struct IsTitleUnicode { [&previous_cased, &rule_3](uint32_t codepoint) { if (IsLowerCaseCharacterUnicode(codepoint)) { if (!previous_cased) return false; // rule 1 broken - // next should be more lower case or uncased + // next should be more lower case or caseless previous_cased = true; } else if (IsUpperCaseCharacterUnicode(codepoint)) { if (previous_cased) return false; // rule 2 broken - // next should be a lower case or uncased + // next should be a lower case or caseless previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // an uncased char, like _ or 1 - // next should be upper case or more uncased + // a caseless char, like _ or 1 + // next should be upper case or more caseless previous_cased = false; } return true; @@ -1839,9 +1839,9 @@ struct IsTitleAscii { Status*) { // rules: // 1. lower case follows cased - // 2. upper case follows uncased + // 2. upper case follows caseless // 3. at least 1 cased character (which logically should be upper/title) - // NOTE: Non-ASCII characters are seen as uncased. + // NOTE: Non-ASCII characters are seen as caseless. bool rules_1_and_2 = true; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; @@ -1852,7 +1852,7 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } - // next should be more lower case or uncased + // next should be more lower case or caseless previous_cased = true; } else if (IsUpperCaseCharacterAscii(*c)) { if (previous_cased) { @@ -1860,12 +1860,12 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } - // next should be a lower case or uncased + // next should be a lower case or caseless previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // an uncased character, like _ or 1 - // next should be upper case or more uncased + // a caseless character, like _ or 1 + // next should be upper case or more caseless previous_cased = false; } } @@ -4117,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc( "Classify strings as ASCII titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a uncased character, and each lowercase character follows\n" + "follows a caseless character, and each lowercase character follows\n" "an uppercase character.\n")); const auto utf8_is_alnum_doc = @@ -4142,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc( "Classify strings as titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a uncased character, and each lowercase character follows\n" + "follows a caseless character, and each lowercase character follows\n" "an uppercase character.\n")); const FunctionDoc ascii_upper_doc( diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 56b10fe2693..9d4a6bb0381 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -700,7 +700,7 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) { TYPED_TEST(TestStringKernels, IsSpaceAscii) { // \xe2\x80\x88 is punctuation space - // Note: for ascii version, the non-ascii chars are seen as non-cased + // Note: for ascii version, the non-ascii chars are seen as caseless this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", @@ -709,7 +709,7 @@ TYPED_TEST(TestStringKernels, IsSpaceAscii) { TYPED_TEST(TestStringKernels, IsTitleAscii) { // ٣ is arabic 3 (decimal), Φ capital - // Note: for ascii version, the non-ascii chars are seen as non-cased + // Note: for ascii version, the non-ascii chars are seen as caseless this->CheckUnary("ascii_is_title", "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", boolean(), "[true, null, true, true, true, false, false]"); From 983a90c7bead9b0a0ee798590fe183d998bbef14 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 23:06:27 -0400 Subject: [PATCH 18/23] use uncased term and update some comments --- .../arrow/compute/kernels/scalar_string.cc | 36 +++++++++---------- .../compute/kernels/scalar_string_test.cc | 4 +-- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f7424aeb87e..2fef5fca98b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -87,12 +87,13 @@ static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { } static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + // Note: Non-ASCII characters are seen as uncased. return IsLowerCaseCharacterAscii(ascii_character) || IsUpperCaseCharacterAscii(ascii_character); } static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { - return IsCasedCharacterAscii(ascii_character); // same + return IsCasedCharacterAscii(ascii_character); } static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { @@ -106,7 +107,7 @@ static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { } static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || + return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' '); } @@ -536,7 +537,7 @@ struct Utf8TitleTransform : public StringTransformCodepointBase { output = util::UTF8Encode(output, TransformCodepoint(codepoint)); TransformCodepoint = UTF8LowerTransform::TransformCodepoint; } else { - // Copy caseless codepoint, prepare to uppercase next cased codepoint + // Copy uncased codepoint, prepare to uppercase next cased codepoint std::memcpy(output, input, next - input); output += next - input; TransformCodepoint = UTF8UpperTransform::TransformCodepoint; @@ -710,7 +711,7 @@ struct AsciiTitleTransform : public StringTransformBase { *output++ = ascii_to_ul(*ch); ascii_to_ul = ascii_tolower; } else { - // Copy caseless character, prepare to uppercase next cased character + // Copy uncased character, prepare to uppercase next cased character *output++ = *ch; ascii_to_ul = ascii_toupper; } @@ -1801,7 +1802,7 @@ struct IsTitleUnicode { Status* st) { // rules: // 1. lower case follows cased - // 2. upper case follows caseless + // 2. upper case follows uncased // 3. at least 1 cased character (which logically should be upper/title) bool rules_1_and_2; bool previous_cased = false; // in LL, LU or LT @@ -1811,16 +1812,16 @@ struct IsTitleUnicode { [&previous_cased, &rule_3](uint32_t codepoint) { if (IsLowerCaseCharacterUnicode(codepoint)) { if (!previous_cased) return false; // rule 1 broken - // next should be more lower case or caseless + // next should be more lower case or uncased previous_cased = true; } else if (IsUpperCaseCharacterUnicode(codepoint)) { if (previous_cased) return false; // rule 2 broken - // next should be a lower case or caseless + // next should be a lower case or uncased previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a caseless char, like _ or 1 - // next should be upper case or more caseless + // an uncased char, like _ or 1 + // next should be upper case or more uncased previous_cased = false; } return true; @@ -1837,11 +1838,10 @@ struct IsTitleUnicode { struct IsTitleAscii { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, Status*) { - // rules: + // Rules: // 1. lower case follows cased - // 2. upper case follows caseless + // 2. upper case follows uncased // 3. at least 1 cased character (which logically should be upper/title) - // NOTE: Non-ASCII characters are seen as caseless. bool rules_1_and_2 = true; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; @@ -1852,7 +1852,7 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } - // next should be more lower case or caseless + // next should be more lower case or uncased previous_cased = true; } else if (IsUpperCaseCharacterAscii(*c)) { if (previous_cased) { @@ -1860,12 +1860,12 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } - // next should be a lower case or caseless + // next should be a lower case or uncased previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a caseless character, like _ or 1 - // next should be upper case or more caseless + // an uncased character, like _ or 1 + // next should be upper case or more uncased previous_cased = false; } } @@ -4117,7 +4117,7 @@ const auto ascii_is_title_doc = StringPredicateDoc( "Classify strings as ASCII titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a caseless character, and each lowercase character follows\n" + "follows an uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const auto utf8_is_alnum_doc = @@ -4142,7 +4142,7 @@ const auto utf8_is_title_doc = StringPredicateDoc( "Classify strings as titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a caseless character, and each lowercase character follows\n" + "follows an uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const FunctionDoc ascii_upper_doc( diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 9d4a6bb0381..3ac19ead5e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -700,7 +700,6 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) { TYPED_TEST(TestStringKernels, IsSpaceAscii) { // \xe2\x80\x88 is punctuation space - // Note: for ascii version, the non-ascii chars are seen as caseless this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", @@ -708,8 +707,7 @@ TYPED_TEST(TestStringKernels, IsSpaceAscii) { } TYPED_TEST(TestStringKernels, IsTitleAscii) { - // ٣ is arabic 3 (decimal), Φ capital - // Note: for ascii version, the non-ascii chars are seen as caseless + // ٣ is Arabic 3 (decimal), Φ capital this->CheckUnary("ascii_is_title", "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", boolean(), "[true, null, true, true, true, false, false]"); From 2c768289acd8054ba47e614e88332c52c1322b87 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 31 Aug 2021 23:07:08 -0400 Subject: [PATCH 19/23] compact statement in utf8_length --- cpp/src/arrow/util/utf8.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 0c9a368d3dd..c5a100ff267 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -560,8 +560,7 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) { int64_t length = 0; while (first != last) { - length += ((*first & 0xc0) != 0x80); - ++first; + length += ((*first++ & 0xc0) != 0x80); } return length; } From 147f7b026ced9e6550c5f67d74f9d8ef699a3a1c Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 1 Sep 2021 13:08:15 -0400 Subject: [PATCH 20/23] rename to FunctionalCaseMappingTransform --- cpp/src/arrow/compute/kernels/scalar_string.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 2fef5fca98b..32d7cff4ca8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -423,7 +423,7 @@ struct StringTransformExecWithState #ifdef ARROW_WITH_UTF8PROC -struct StringTransformCodepointBase : public StringTransformBase { +struct FunctionalCaseMappingTransform : public StringTransformBase { Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override { EnsureLookupTablesFilled(); return Status::OK(); @@ -441,7 +441,7 @@ struct StringTransformCodepointBase : public StringTransformBase { }; template -struct StringTransformCodepoint : public StringTransformCodepointBase { +struct StringTransformCodepoint : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; @@ -454,7 +454,7 @@ struct StringTransformCodepoint : public StringTransformCodepointBase { } }; -struct UTF8UpperTransform : public StringTransformCodepointBase { +struct UTF8UpperTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint] : utf8proc_toupper(codepoint); @@ -464,7 +464,7 @@ struct UTF8UpperTransform : public StringTransformCodepointBase { template using UTF8Upper = StringTransformExec>; -struct UTF8LowerTransform : public StringTransformCodepointBase { +struct UTF8LowerTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint] : utf8proc_tolower(codepoint); @@ -474,7 +474,7 @@ struct UTF8LowerTransform : public StringTransformCodepointBase { template using UTF8Lower = StringTransformExec>; -struct UTF8SwapCaseTransform : public StringTransformCodepointBase { +struct UTF8SwapCaseTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { if (codepoint <= kMaxCodepointLookup) { return lut_swapcase_codepoint[codepoint]; @@ -494,7 +494,7 @@ template using UTF8SwapCase = StringTransformExec>; -struct Utf8CapitalizeTransform : public StringTransformCodepointBase { +struct Utf8CapitalizeTransform : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; @@ -520,7 +520,7 @@ struct Utf8CapitalizeTransform : public StringTransformCodepointBase { template using Utf8Capitalize = StringTransformExec; -struct Utf8TitleTransform : public StringTransformCodepointBase { +struct Utf8TitleTransform : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; From dbe7229e0840a640c0eef3f34be1adabbc0d42e8 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 1 Sep 2021 13:37:16 -0400 Subject: [PATCH 21/23] use boolean flag and change for-while loop --- .../arrow/compute/kernels/scalar_string.cc | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 32d7cff4ca8..28d0de77573 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -524,23 +524,29 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; - uint32_t (*TransformCodepoint)(uint32_t) = UTF8UpperTransform::TransformCodepoint; - for (const uint8_t *next = input, *end = input + input_string_ncodeunits; next < end; - input = next) { + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* next = input; + bool is_next_upper = true; + while ((input = next) < end) { uint32_t codepoint; if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) { return kTransformError; } if (IsCasedCharacterUnicode(codepoint)) { - // Lower/uppercase codepoint, prepare to lowercase next consecutive cased - // codepoints - output = util::UTF8Encode(output, TransformCodepoint(codepoint)); - TransformCodepoint = UTF8LowerTransform::TransformCodepoint; + // Lower/uppercase current codepoint and + // prepare to lowercase next consecutive cased codepoints + output = is_next_upper + ? util::UTF8Encode(output, + UTF8UpperTransform::TransformCodepoint(codepoint)) + : util::UTF8Encode( + output, UTF8LowerTransform::TransformCodepoint(codepoint)); + is_next_upper = false; } else { - // Copy uncased codepoint, prepare to uppercase next cased codepoint + // Copy current uncased codepoint and + // prepare to uppercase next cased codepoint std::memcpy(output, input, next - input); output += next - input; - TransformCodepoint = UTF8UpperTransform::TransformCodepoint; + is_next_upper = true; } } return output - output_start; @@ -703,17 +709,20 @@ using AsciiCapitalize = StringTransformExec; struct AsciiTitleTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { - uint8_t (*ascii_to_ul)(uint8_t) = ascii_toupper; - for (const uint8_t* ch = input; ch < (input + input_string_ncodeunits); ++ch) { - if (IsCasedCharacterAscii(*ch)) { - // Lower/uppercase character, prepare to lowercase next consecutive cased - // characters - *output++ = ascii_to_ul(*ch); - ascii_to_ul = ascii_tolower; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* next = input; + bool is_next_upper = true; + while ((input = next++) < end) { + if (IsCasedCharacterAscii(*input)) { + // Lower/uppercase current character and + // prepare to lowercase next consecutive cased characters + *output++ = is_next_upper ? ascii_toupper(*input) : ascii_tolower(*input); + is_next_upper = false; } else { - // Copy uncased character, prepare to uppercase next cased character - *output++ = *ch; - ascii_to_ul = ascii_toupper; + // Copy current uncased character and + // prepare to uppercase next cased character + *output++ = *input; + is_next_upper = true; } } return input_string_ncodeunits; From 5955c4e8bc5581cbbd431fefe68d990caed2ac5e Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 1 Sep 2021 13:45:17 -0400 Subject: [PATCH 22/23] revert case call in IsTitle kernels --- cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 28d0de77573..df0827f67fd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -1823,7 +1823,7 @@ struct IsTitleUnicode { if (!previous_cased) return false; // rule 1 broken // next should be more lower case or uncased previous_cased = true; - } else if (IsUpperCaseCharacterUnicode(codepoint)) { + } else if (IsCasedCharacterUnicode(codepoint)) { if (previous_cased) return false; // rule 2 broken // next should be a lower case or uncased previous_cased = true; @@ -1863,7 +1863,7 @@ struct IsTitleAscii { } // next should be more lower case or uncased previous_cased = true; - } else if (IsUpperCaseCharacterAscii(*c)) { + } else if (IsCasedCharacterAscii(*c)) { if (previous_cased) { // rule 2 broken rules_1_and_2 = false; From 23c93385849f9c460c62444406816409958960a6 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Wed, 1 Sep 2021 15:12:28 -0400 Subject: [PATCH 23/23] fix lint error --- cpp/src/arrow/compute/kernels/scalar_string.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index df0827f67fd..aa953119d47 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -107,8 +107,7 @@ static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { } static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= 9) && (ascii_character <= 13)) || - (ascii_character == ' '); + return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' '); } static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {