diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f5de81acd67..aa953119d47 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -66,19 +66,16 @@ Status RegexStatus(const RE2& regex) { } #endif -// Code units in the range [a-z] can only be an encoding of an ascii -// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different -// codepoint. This guaranteed by non-overlap design of the unicode standard. (see -// section 2.5 of Unicode Standard Core Specification v13.0) +// Code units in the range [a-z] can only be an encoding of an ASCII +// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different +// codepoint. This is guaranteed by the non-overlap design of the Unicode +// standard. (see section 2.5 of Unicode Standard Core Specification v13.0) -static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) { - return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) - : utf8_code_unit; -} +// ASCII character classification helpers (IsAlpha, IsDigit, etc.) -static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { - return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32) - : utf8_code_unit; +template <typename T> +static inline bool IsAsciiCharacter(T character) { + return character < 128; } static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) { @@ -90,12 +87,13 @@ static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) { } static inline bool IsCasedCharacterAscii(uint8_t ascii_character) { + // Note: Non-ASCII characters are seen as uncased. 
return IsLowerCaseCharacterAscii(ascii_character) || IsUpperCaseCharacterAscii(ascii_character); } static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) { - return IsCasedCharacterAscii(ascii_character); // same + return IsCasedCharacterAscii(ascii_character); } static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) { @@ -109,28 +107,13 @@ static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) { } static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) { - return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) || - (ascii_character == ' '); + return ((ascii_character >= 9) && (ascii_character <= 13)) || (ascii_character == ' '); } static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) { return ((ascii_character >= ' ') && (ascii_character <= '~')); } -static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { - if (IsLowerCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit -= 32; - } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { - utf8_code_unit += 32; - } - return utf8_code_unit; -} - -template -static inline bool IsAsciiCharacter(T character) { - return character < 128; -} - struct BinaryLength { template static OutValue Call(KernelContext*, Arg0Value val, Status*) { @@ -147,6 +130,25 @@ struct Utf8Length { } }; +static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) { + return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32) + : utf8_code_unit; +} + +static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) { + return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? 
(utf8_code_unit - 32) + : utf8_code_unit; +} + +static inline uint8_t ascii_swapcase(uint8_t utf8_code_unit) { + if (IsLowerCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit -= 32; + } else if (IsUpperCaseCharacterAscii(utf8_code_unit)) { + utf8_code_unit += 32; + } + return utf8_code_unit; +} + #ifdef ARROW_WITH_UTF8PROC // Direct lookup tables for unicode properties @@ -420,17 +422,25 @@ struct StringTransformExecWithState #ifdef ARROW_WITH_UTF8PROC -template <typename CodepointTransform> -struct StringTransformCodepoint : public StringTransformBase { +struct FunctionalCaseMappingTransform : public StringTransformBase { Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override { EnsureLookupTablesFilled(); return Status::OK(); } int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { - return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits); + // Section 5.18 of the Unicode spec claims that the number of codepoints for case + // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes. + // However, since we don't support all casings (SpecialCasing.txt) the growth + // in bytes is actually only at max 3/2 (as covered by the unittest). + // Note that rounding down the 3/2 is ok, since only codepoints encoded by + // two code units (even) can grow to 3 code units. + return static_cast<int64_t>(input_ncodeunits) * 3 / 2; } +}; +template <typename CodepointTransform> +struct StringTransformCodepoint : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; @@ -443,20 +453,7 @@ struct StringTransformCodepoint : public StringTransformBase { } }; -// struct CaseMappingMixin { -struct CaseMappingTransform { - static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) { - // Section 5.18 of the Unicode spec claims that the number of codepoints for case - // mapping can grow by a factor of 3. 
This means grow by a factor of 3 in bytes - // However, since we don't support all casings (SpecialCasing.txt) the growth - // in bytes is actually only at max 3/2 (as covered by the unittest). - // Note that rounding down the 3/2 is ok, since only codepoints encoded by - // two code units (even) can grow to 3 code units. - return static_cast<int64_t>(input_ncodeunits) * 3 / 2; - } -}; - -struct UTF8UpperTransform : public CaseMappingTransform { +struct UTF8UpperTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint] : utf8proc_toupper(codepoint); @@ -466,7 +463,7 @@ struct UTF8UpperTransform : public CaseMappingTransform { template <typename Type> using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>; -struct UTF8LowerTransform : public CaseMappingTransform { +struct UTF8LowerTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { return codepoint <= kMaxCodepointLookup ? 
lut_lower_codepoint[codepoint] : utf8proc_tolower(codepoint); @@ -476,7 +473,7 @@ struct UTF8LowerTransform : public CaseMappingTransform { template <typename Type> using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>; -struct UTF8SwapCaseTransform : public CaseMappingTransform { +struct UTF8SwapCaseTransform : public FunctionalCaseMappingTransform { static uint32_t TransformCodepoint(uint32_t codepoint) { if (codepoint <= kMaxCodepointLookup) { return lut_swapcase_codepoint[codepoint]; @@ -496,27 +493,22 @@ template <typename Type> using UTF8SwapCase = StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>; -struct Utf8CapitalizeTransform : public StringTransformBase { +struct Utf8CapitalizeTransform : public FunctionalCaseMappingTransform { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { uint8_t* output_start = output; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* next = input; if (input_string_ncodeunits > 0) { - // Get number of code units in first code point - uint32_t codepoint = 0; - const uint8_t* i = input; - if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&i, &codepoint))) { + if (ARROW_PREDICT_FALSE(!util::UTF8AdvanceCodepoints(input, end, &next, 1))) { return kTransformError; } - int64_t codepoint_ncodeunits = - std::min(static_cast<int64_t>(i - input), input_string_ncodeunits); - if (ARROW_PREDICT_FALSE( - !util::UTF8Transform(input, input + codepoint_ncodeunits, &output, - UTF8UpperTransform::TransformCodepoint))) { + if (ARROW_PREDICT_FALSE(!util::UTF8Transform( + input, next, &output, UTF8UpperTransform::TransformCodepoint))) { return kTransformError; } if (ARROW_PREDICT_FALSE(!util::UTF8Transform( - input + codepoint_ncodeunits, input + input_string_ncodeunits, &output, - UTF8LowerTransform::TransformCodepoint))) { + next, end, &output, UTF8LowerTransform::TransformCodepoint))) { return kTransformError; } } @@ -527,6 +519,42 @@ struct Utf8CapitalizeTransform : public StringTransformBase { template <typename Type> using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>; +struct Utf8TitleTransform : 
public FunctionalCaseMappingTransform { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + uint8_t* output_start = output; + const uint8_t* end = input + input_string_ncodeunits; + const uint8_t* next = input; + bool is_next_upper = true; + while ((input = next) < end) { + uint32_t codepoint; + if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&next, &codepoint))) { + return kTransformError; + } + if (IsCasedCharacterUnicode(codepoint)) { + // Lower/uppercase current codepoint and + // prepare to lowercase next consecutive cased codepoints + output = is_next_upper + ? util::UTF8Encode(output, + UTF8UpperTransform::TransformCodepoint(codepoint)) + : util::UTF8Encode( + output, UTF8LowerTransform::TransformCodepoint(codepoint)); + is_next_upper = false; + } else { + // Copy current uncased codepoint and + // prepare to uppercase next cased codepoint + std::memcpy(output, input, next - input); + output += next - input; + is_next_upper = true; + } + } + return output - output_start; + } +}; + +template <typename Type> +using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; + #endif // ARROW_WITH_UTF8PROC struct AsciiReverseTransform : public StringTransformBase { @@ -667,8 +695,8 @@ struct AsciiCapitalizeTransform : public StringTransformBase { int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, uint8_t* output) { if (input_string_ncodeunits > 0) { - *output = ascii_toupper(*input); - TransformAsciiLower(input + 1, input_string_ncodeunits - 1, output + 1); + *output++ = ascii_toupper(*input++); + TransformAsciiLower(input, input_string_ncodeunits - 1, output); } return input_string_ncodeunits; } @@ -677,6 +705,32 @@ struct AsciiCapitalizeTransform : public StringTransformBase { template <typename Type> using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>; +struct AsciiTitleTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + const uint8_t* end = input + input_string_ncodeunits; + 
const uint8_t* next = input; + bool is_next_upper = true; + while ((input = next++) < end) { + if (IsCasedCharacterAscii(*input)) { + // Lower/uppercase current character and + // prepare to lowercase next consecutive cased characters + *output++ = is_next_upper ? ascii_toupper(*input) : ascii_tolower(*input); + is_next_upper = false; + } else { + // Copy current uncased character and + // prepare to uppercase next cased character + *output++ = *input; + is_next_upper = true; + } + } + return input_string_ncodeunits; + } +}; + +template <typename Type> +using AsciiTitle = StringTransformExec<Type, AsciiTitleTransform>; + // ---------------------------------------------------------------------- // exact pattern detection @@ -1755,9 +1809,9 @@ struct IsTitleUnicode { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, Status* st) { // rules: - // * 1: lower case follows cased - // * 2: upper case follows uncased - // * 3: at least 1 cased character (which logically should be upper/title) + // 1. lower case follows cased + // 2. upper case follows uncased + // 3. 
at least 1 cased character (which logically should be upper/title) bool rules_1_and_2; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; @@ -1766,6 +1820,7 @@ struct IsTitleUnicode { [&previous_cased, &rule_3](uint32_t codepoint) { if (IsLowerCaseCharacterUnicode(codepoint)) { if (!previous_cased) return false; // rule 1 broken + // next should be more lower case or uncased previous_cased = true; } else if (IsCasedCharacterUnicode(codepoint)) { if (previous_cased) return false; // rule 2 broken @@ -1773,7 +1828,7 @@ struct IsTitleUnicode { previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a non-cased char, like _ or 1 + // an uncased char, like _ or 1 // next should be upper case or more uncased previous_cased = false; } @@ -1791,14 +1846,13 @@ struct IsTitleUnicode { struct IsTitleAscii { static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits, Status*) { - // rules: - // * 1: lower case follows cased - // * 2: upper case follows uncased - // * 3: at least 1 cased character (which logically should be upper/title) + // Rules: + // 1. lower case follows cased + // 2. upper case follows uncased + // 3. 
at least 1 cased character (which logically should be upper/title) bool rules_1_and_2 = true; bool previous_cased = false; // in LL, LU or LT bool rule_3 = false; - // we cannot rely on std::all_of because we need guaranteed order for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) { if (IsLowerCaseCharacterAscii(*c)) { if (!previous_cased) { @@ -1806,6 +1860,7 @@ struct IsTitleAscii { rules_1_and_2 = false; break; } + // next should be more lower case or uncased previous_cased = true; } else if (IsCasedCharacterAscii(*c)) { if (previous_cased) { @@ -1817,7 +1872,7 @@ struct IsTitleAscii { previous_cased = true; rule_3 = true; // rule 3 obeyed } else { - // a non-cased char, like _ or 1 + // an uncased character, like _ or 1 // next should be upper case or more uncased previous_cased = false; } @@ -4070,7 +4125,7 @@ const auto ascii_is_title_doc = StringPredicateDoc( "Classify strings as ASCII titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. it has at least one cased character, each uppercase character\n" - "follows a non-cased character, and each lowercase character follows\n" + "follows an uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const auto utf8_is_alnum_doc = @@ -4095,7 +4150,7 @@ const auto utf8_is_title_doc = StringPredicateDoc( "Classify strings as titlecase", ("For each string in `strings`, emit true iff the string is title-cased,\n" "i.e. 
it has at least one cased character, each uppercase character\n" - "follows a non-cased character, and each lowercase character follows\n" + "follows an uncased character, and each lowercase character follows\n" "an uppercase character.\n")); const FunctionDoc ascii_upper_doc( @@ -4127,6 +4182,15 @@ const FunctionDoc ascii_capitalize_doc( "non-ASCII characters, use \"utf8_capitalize\" instead."), {"strings"}); +const FunctionDoc ascii_title_doc( + "Titlecase each word of ASCII input", + ("For each string in `strings`, return a titlecased version.\n" + "Each word in the output will start with an uppercase character and its\n" + "remaining characters will be lowercase.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_title\" instead."), + {"strings"}); + const FunctionDoc ascii_reverse_doc( "Reverse ASCII input", ("For each ASCII string in `strings`, return a reversed version.\n\n" @@ -4153,6 +4217,13 @@ const FunctionDoc utf8_capitalize_doc( "with the first character uppercased and the others lowercased."), {"strings"}); +const FunctionDoc utf8_title_doc( + "Titlecase each word of input", + ("For each string in `strings`, return a titlecased version.\n" + "Each word in the output will start with an uppercase character and its\n" + "remaining characters will be lowercase."), + {"strings"}); + const FunctionDoc utf8_reverse_doc( "Reverse input", ("For each string in `strings`, return a reversed version.\n\n" @@ -4164,8 +4235,9 @@ const FunctionDoc utf8_reverse_doc( } // namespace void RegisterScalarStringAscii(FunctionRegistry* registry) { - // ascii_upper and ascii_lower are able to reuse the original offsets buffer, - // so don't preallocate them in the output. + // Some kernels are able to reuse the original offsets buffer, so don't + // preallocate them in the output. Only kernels that invoke + // "StringDataTransform" support no preallocation. 
MakeUnaryStringBatchKernel("ascii_upper", registry, &ascii_upper_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_lower", registry, &ascii_lower_doc, @@ -4174,6 +4246,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { "ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE); MakeUnaryStringBatchKernel("ascii_capitalize", registry, &ascii_capitalize_doc); + MakeUnaryStringBatchKernel("ascii_title", registry, &ascii_title_doc); MakeUnaryStringBatchKernel("ascii_trim_whitespace", registry, &ascii_trim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_ltrim_whitespace", registry, @@ -4221,6 +4294,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { &utf8_swapcase_doc); MakeUnaryStringBatchKernel("utf8_capitalize", registry, &utf8_capitalize_doc); + MakeUnaryStringBatchKernel("utf8_title", registry, &utf8_title_doc); MakeUnaryStringBatchKernel("utf8_trim_whitespace", registry, &utf8_trim_whitespace_doc); MakeUnaryStringBatchKernel("utf8_ltrim_whitespace", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 271ecb0b415..3ac19ead5e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -413,6 +413,14 @@ TYPED_TEST(TestStringKernels, AsciiCapitalize) { "\"!hello, world!\"]"); } +TYPED_TEST(TestStringKernels, AsciiTitle) { + this->CheckUnary( + "ascii_title", + R"([null, "", "b", "aAaz;ZeA&", "arRoW", "iI", "a.a.a..A", "hEllO, WoRld!", "foo baR;heHe0zOP", "!%$^.,;"])", + this->type(), + R"([null, "", "B", "Aaaz;Zea&", "Arrow", "Ii", "A.A.A..A", "Hello, World!", "Foo Bar;Hehe0Zop", "!%$^.,;"])"); +} + TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), @@ -522,7 +530,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { 
// test maximum buffer growth this->CheckUnary("utf8_swapcase", "[\"ȺȺȺȺ\"]", this->type(), "[\"ⱥⱥⱥⱥ\"]"); - this->CheckUnary("ascii_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), + this->CheckUnary("utf8_swapcase", "[\"hEllO, WoRld!\", \"$. A35?\"]", this->type(), "[\"HeLLo, wOrLD!\", \"$. a35?\"]"); // Test invalid data @@ -532,7 +540,7 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) { } TYPED_TEST(TestStringKernels, Utf8Capitalize) { - this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]"); + this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]"); this->CheckUnary("utf8_capitalize", "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", " "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]", @@ -541,6 +549,14 @@ TYPED_TEST(TestStringKernels, Utf8Capitalize) { "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]"); } +TYPED_TEST(TestStringKernels, Utf8Title) { + this->CheckUnary( + "utf8_title", + R"([null, "", "b", "aAaz;ZæÆ&", "ɑɽⱤoW", "ıI", "ⱥ.ⱥ.ⱥ..Ⱥ", "hEllO, WoRld!", "foo baR;héHé0zOP", "!%$^.,;"])", + this->type(), + R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, World!", "Foo Bar;Héhé0Zop", "!%$^.,;"])"); +} + TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) { // U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc things it is // UTF8PROC_CATEGORY_LO @@ -684,7 +700,6 @@ TYPED_TEST(TestStringKernels, IsPrintableAscii) { TYPED_TEST(TestStringKernels, IsSpaceAscii) { // \xe2\x80\x88 is punctuation space - // Note: for ascii version, the non-ascii chars are seen as non-cased this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(), "[true, null, true, true]"); this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]", @@ -692,8 +707,7 @@ TYPED_TEST(TestStringKernels, IsSpaceAscii) { } TYPED_TEST(TestStringKernels, IsTitleAscii) { - // ٣ is arabic 3 (decimal), Φ capital - // Note: for ascii version, the non-ascii chars are seen as non-cased + // ٣ is Arabic 3 
(decimal), Φ capital this->CheckUnary("ascii_is_title", "[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]", boolean(), "[true, null, true, true, true, false, false]"); diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 0c9a368d3dd..c5a100ff267 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -560,8 +560,7 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) { int64_t length = 0; while (first != last) { - length += ((*first & 0xc0) != 0x80); - ++first; + length += ((*first++ & 0xc0) != 0x80); } return length; } diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 465500e8dae..7263d77acf2 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -697,7 +697,7 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | Function name | Arity | Input types | Output type | Options class | Notes | +=========================+=======+========================+========================+===================================+=======+ -| ascii_capitalize | Unary | String-like | String-like | | | +| ascii_capitalize | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_lower | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ @@ -705,6 +705,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_swapcase | Unary | String-like | String-like | | \(1) | 
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| ascii_title | Unary | String-like | String-like | | \(1) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | ascii_upper | Unary | String-like | String-like | | \(1) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | binary_length | Unary | Binary- or String-like | Int32 or Int64 | | \(3) | @@ -715,7 +717,7 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ -| utf8_capitalize | Unary | String-like | String-like | | | +| utf8_capitalize | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_length | Unary | String-like | Int32 or Int64 | | \(7) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ @@ -727,6 +729,8 @@ String transforms +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ | utf8_swapcase | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ +| utf8_title | Unary | String-like | String-like | | \(8) | ++-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ 
| utf8_upper | Unary | String-like | String-like | | \(8) | +-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b737439147e..f7f740c24e5 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -264,6 +264,7 @@ String Transforms ascii_rtrim ascii_rtrim_whitespace ascii_swapcase + ascii_title ascii_trim ascii_upper binary_length @@ -283,6 +284,7 @@ String Transforms utf8_rtrim utf8_rtrim_whitespace utf8_swapcase + utf8_title utf8_trim utf8_upper diff --git a/r/R/expression.R b/r/R/expression.R index 57466cc3c71..aa9af9270c9 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -51,6 +51,7 @@ # str_pad is defined in dplyr-functions.R # str_sub is defined in dplyr-functions.R "str_to_lower" = "utf8_lower", + "str_to_title" = "utf8_title", "str_to_upper" = "utf8_upper", # str_trim is defined in dplyr-functions.R "stri_reverse" = "utf8_reverse",