Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 72 additions & 10 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -446,10 +446,10 @@ struct StringTransformCodepoint : public StringTransformBase {
// struct CaseMappingMixin {
struct CaseMappingTransform {
static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
// Section 5.18 of the Unicode spec claim that the number of codepoints for case
// Section 5.18 of the Unicode spec claims that the number of codepoints for case
// mapping can grow by a factor of 3. This means growth by a factor of 3 in bytes.
// However, since we don't support all casings (SpecialCasing.txt) the growth
// in bytes iss actually only at max 3/2 (as covered by the unittest).
// in bytes is actually only at max 3/2 (as covered by the unittest).
// Note that rounding down the 3/2 is ok, since only codepoints encoded by
// two code units (even) can grow to 3 code units.
return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
Expand Down Expand Up @@ -496,6 +496,37 @@ template <typename Type>
using UTF8SwapCase =
StringTransformExec<Type, StringTransformCodepoint<UTF8SwapCaseTransform>>;

// Capitalizes a UTF-8 string: the first codepoint is uppercased and every
// following codepoint is lowercased.
struct Utf8CapitalizeTransform : public StringTransformBase {
  // Upper bound on the number of output code units (bytes) for the whole input.
  //
  // Case mapping can change the encoded length of a codepoint, so the output
  // may be longer than the input. Use the same 3/2 growth bound as the other
  // case-mapping transforms in this file.
  // NOTE(review): assumes the bound inherited from StringTransformBase is the
  // plain input size, which would be too small here -- confirm against the
  // base class.
  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
    return input_ncodeunits * 3 / 2;
  }

  // Writes the capitalized form of `input` (of `input_string_ncodeunits` bytes)
  // to `output`.
  //
  // Returns the number of bytes written, or kTransformError if decoding or
  // transforming the input fails.
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    uint8_t* output_start = output;
    if (input_string_ncodeunits > 0) {
      // Decode the first codepoint only to learn how many code units it spans
      uint32_t codepoint = 0;
      const uint8_t* first_codepoint_end = input;
      if (ARROW_PREDICT_FALSE(!util::UTF8Decode(&first_codepoint_end, &codepoint))) {
        return kTransformError;
      }
      // Never let the first codepoint extend past the end of this string
      const int64_t codepoint_ncodeunits = std::min(
          static_cast<int64_t>(first_codepoint_end - input), input_string_ncodeunits);
      // Uppercase the first codepoint...
      if (ARROW_PREDICT_FALSE(
              !util::UTF8Transform(input, input + codepoint_ncodeunits, &output,
                                   UTF8UpperTransform::TransformCodepoint))) {
        return kTransformError;
      }
      // ...and lowercase everything after it
      if (ARROW_PREDICT_FALSE(!util::UTF8Transform(
              input + codepoint_ncodeunits, input + input_string_ncodeunits, &output,
              UTF8LowerTransform::TransformCodepoint))) {
        return kTransformError;
      }
    }
    return output - output_start;
  }
};

// Binds Utf8CapitalizeTransform to StringTransformExec for a given string
// `Type`; this is the template used when registering "utf8_capitalize".
template <typename Type>
using Utf8Capitalize = StringTransformExec<Type, Utf8CapitalizeTransform>;

#endif // ARROW_WITH_UTF8PROC

struct AsciiReverseTransform : public StringTransformBase {
Expand Down Expand Up @@ -632,6 +663,20 @@ struct AsciiSwapCase {
}
};

// Capitalizes an ASCII string: the first byte goes through ascii_toupper and
// the remaining bytes through TransformAsciiLower.
struct AsciiCapitalizeTransform : public StringTransformBase {
  // Writes the capitalized form of `input` to `output` and returns
  // `input_string_ncodeunits` unchanged (the output has the same length).
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    // Nothing to write when there is no first character
    if (input_string_ncodeunits <= 0) {
      return input_string_ncodeunits;
    }
    const int64_t tail_ncodeunits = input_string_ncodeunits - 1;
    output[0] = ascii_toupper(input[0]);
    TransformAsciiLower(input + 1, tail_ncodeunits, output + 1);
    return input_string_ncodeunits;
  }
};

// Binds AsciiCapitalizeTransform to StringTransformExec for a given string
// `Type`; this is the template used when registering "ascii_capitalize".
template <typename Type>
using AsciiCapitalize = StringTransformExec<Type, AsciiCapitalizeTransform>;

// ----------------------------------------------------------------------
// exact pattern detection

Expand Down Expand Up @@ -4074,6 +4119,20 @@ const FunctionDoc ascii_swapcase_doc(
"non-ASCII characters, use \"utf8_swapcase\" instead."),
{"strings"});

// User-facing documentation for the "ascii_capitalize" function: summary,
// description and argument names. The description spells out that the rest of
// the string is lowercased, matching the wording used for "utf8_capitalize".
const FunctionDoc ascii_capitalize_doc(
    "Capitalize the first character of ASCII input",
    ("For each string in `strings`, return a capitalized version,\n"
     "with the first character uppercased and the others lowercased.\n\n"
     "This function assumes the input is fully ASCII. If it may contain\n"
     "non-ASCII characters, use \"utf8_capitalize\" instead."),
    {"strings"});

// User-facing documentation for the ASCII string-reversal function: summary,
// description (pointing non-ASCII users to "utf8_reverse") and argument names.
const FunctionDoc ascii_reverse_doc(
"Reverse ASCII input",
("For each ASCII string in `strings`, return a reversed version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_reverse\" instead."),
{"strings"});

// User-facing documentation for the uppercase transform: summary, description
// and argument names.
const FunctionDoc utf8_upper_doc(
"Transform input to uppercase",
("For each string in `strings`, return an uppercase version."), {"strings"});
Expand All @@ -4087,17 +4146,16 @@ const FunctionDoc utf8_swapcase_doc(
"lowercase",
("For each string in `strings`, return an opposite case version."), {"strings"});

const FunctionDoc ascii_reverse_doc(
"Reverse ASCII input",
("For each ASCII string in `strings`, return a reversed version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_reverse\" instead."),
const FunctionDoc utf8_capitalize_doc(
"Capitalize the first character of input",
("For each string in `strings`, return a capitalized version,\n"
"with the first character uppercased and the others lowercased."),
{"strings"});

const FunctionDoc utf8_reverse_doc(
"Reverse utf8 input",
("For each utf8 string in `strings`, return a reversed version.\n\n"
"This function operates on codepoints/UTF-8 code units, not grapheme\n"
"Reverse input",
("For each string in `strings`, return a reversed version.\n\n"
"This function operates on Unicode codepoints, not grapheme\n"
"clusters. Hence, it will not correctly reverse grapheme clusters\n"
"composed of multiple codepoints."),
{"strings"});
Expand All @@ -4113,6 +4171,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MemAllocation::NO_PREALLOCATE);
MakeUnaryStringBatchKernel<AsciiSwapCase>(
"ascii_swapcase", registry, &ascii_swapcase_doc, MemAllocation::NO_PREALLOCATE);
MakeUnaryStringBatchKernel<AsciiCapitalize>("ascii_capitalize", registry,
&ascii_capitalize_doc);
MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
&ascii_trim_whitespace_doc);
MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
Expand Down Expand Up @@ -4158,6 +4218,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
MakeUnaryStringUTF8TransformKernel<UTF8SwapCase>("utf8_swapcase", registry,
&utf8_swapcase_doc);
MakeUnaryStringBatchKernel<Utf8Capitalize>("utf8_capitalize", registry,
&utf8_capitalize_doc);
MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
&utf8_trim_whitespace_doc);
MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
Expand Down
22 changes: 21 additions & 1 deletion cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,16 @@ TYPED_TEST(TestStringKernels, AsciiSwapCase) {
"[\"HeLLo, wOrLD!\", \"$. a35?\"]");
}

// Checks "ascii_capitalize" on an empty array and on a mix of ASCII strings,
// nulls, empty strings and strings containing non-ASCII characters.
TYPED_TEST(TestStringKernels, AsciiCapitalize) {
  // An empty array stays empty
  this->CheckUnary("ascii_capitalize", "[]", this->type(), "[]");

  // Non-ASCII characters (æ, Æ) are expected to come back unchanged
  const char* json_input =
      "[\"aAazZæÆ&\", null, \"\", \"bBB\", \"hEllO, WoRld!\", \"$. A3\", "
      "\"!hELlo, wORLd!\"]";
  const char* json_expected =
      "[\"AaazzæÆ&\", null, \"\", \"Bbb\", \"Hello, world!\", \"$. a3\", "
      "\"!hello, world!\"]";
  this->CheckUnary("ascii_capitalize", json_input, this->type(), json_expected);
}

TYPED_TEST(TestStringKernels, AsciiReverse) {
this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
Expand Down Expand Up @@ -462,7 +472,7 @@ TYPED_TEST(TestStringKernels, Utf8Upper) {
this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(),
"[\"AAAZZÆÆ&\", null, \"\", \"B\"]");

// test varying encoding lenghts and thus changing indices/offsets
// test varying encoding lengths and thus changing indices/offsets
this->CheckUnary("utf8_upper", "[\"ɑɽⱤoW\", null, \"ıI\", \"b\"]", this->type(),
"[\"ⱭⱤⱤOW\", null, \"II\", \"B\"]");

Expand Down Expand Up @@ -521,6 +531,16 @@ TYPED_TEST(TestStringKernels, Utf8SwapCase) {
CallFunction("utf8_swapcase", {invalid_input}));
}

// Checks "utf8_capitalize" on an empty array and on strings whose codepoints
// change encoded length when their case changes.
TYPED_TEST(TestStringKernels, Utf8Capitalize) {
  // Exercise the UTF-8 kernel (not the ASCII one) on empty input
  this->CheckUnary("utf8_capitalize", "[]", this->type(), "[]");
  this->CheckUnary("utf8_capitalize",
                   "[\"aAazZæÆ&\", null, \"\", \"b\", \"ɑɽⱤoW\", \"ıI\", \"ⱥⱥⱥȺ\", "
                   "\"hEllO, WoRld!\", \"$. A3\", \"!ɑⱤⱤow\"]",
                   this->type(),
                   "[\"Aaazzææ&\", null, \"\", \"B\", \"Ɑɽɽow\", \"Ii\", \"Ⱥⱥⱥⱥ\", "
                   "\"Hello, world!\", \"$. a3\", \"!ɑɽɽow\"]");
}

TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
Expand Down
4 changes: 4 additions & 0 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,8 @@ String transforms
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| Function name | Arity | Input types | Output type | Options class | Notes |
+=========================+=======+========================+========================+===================================+=======+
| ascii_capitalize | Unary | String-like | String-like | | |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| ascii_lower | Unary | String-like | String-like | | \(1) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| ascii_reverse | Unary | String-like | String-like | | \(2) |
Expand All @@ -603,6 +605,8 @@ String transforms
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_capitalize | Unary | String-like | String-like | | |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_length | Unary | String-like | Int32 or Int64 | | \(7) |
+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
| utf8_lower | Unary | String-like | String-like | | \(8) |
Expand Down
2 changes: 2 additions & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ String Transforms
.. autosummary::
:toctree: ../generated/

ascii_capitalize
ascii_center
ascii_lpad
ascii_ltrim
Expand All @@ -266,6 +267,7 @@ String Transforms
binary_replace_slice
replace_substring
replace_substring_regex
utf8_capitalize
utf8_center
utf8_length
utf8_lower
Expand Down