Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 64 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@ struct StringTransform {
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
return Derived().Execute(ctx, batch, out);
}

// Error status returned by Execute() when Transform() reports failure.
// Execute() calls it as Derived::InvalidStatus(), so a derived transform can
// declare its own static InvalidStatus() to supply a more specific message.
static Status InvalidStatus() {
return Status::Invalid("Invalid UTF8 sequence in input");
}

Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
const ArrayData& input = *batch[0].array();
Expand Down Expand Up @@ -173,7 +178,7 @@ struct StringTransform {
if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
input_string, input_string_ncodeunits, output_str + output_ncodeunits,
&encoded_nbytes))) {
return Status::Invalid("Invalid UTF8 sequence in input");
return Derived::InvalidStatus();
}
output_ncodeunits += encoded_nbytes;
output_string_offsets[i + 1] = output_ncodeunits;
Expand All @@ -199,7 +204,7 @@ struct StringTransform {
if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
input.value->data(), data_nbytes, value_buffer->mutable_data(),
&encoded_nbytes))) {
return Status::Invalid("Invalid UTF8 sequence in input");
return Derived::InvalidStatus();
}
RETURN_NOT_OK(value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true));
}
Expand Down Expand Up @@ -266,6 +271,45 @@ void EnsureLookupTablesFilled() {}

#endif // ARROW_WITH_UTF8PROC

// String transform that reverses each input string byte by byte.
//
// Only pure ASCII input is accepted: if any input byte has its high bit set,
// Transform() returns false (after having written the whole output) and the
// error reported is the one built by InvalidStatus() below.
template <typename Type>
struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
  using Base = StringTransform<Type, AsciiReverse<Type>>;
  using offset_type = typename Base::offset_type;

  // Copy `input_string_ncodeunits` bytes from `input` into `output` in reverse
  // order and store that same count in `*output_written`.
  //
  // Returns true if every input byte is ASCII (< 0x80), false otherwise.
  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                 uint8_t* output, offset_type* output_written) {
    // OR of all input bytes; its top bit ends up set iff some byte is non-ASCII.
    uint8_t seen_bits = 0;
    // Output cursor, walking backwards from one-past-the-end.
    offset_type write_pos = input_string_ncodeunits;
    for (offset_type read_pos = 0; read_pos < input_string_ncodeunits; ++read_pos) {
      const uint8_t byte = input[read_pos];
      seen_bits |= byte;
      output[--write_pos] = byte;
    }
    *output_written = input_string_ncodeunits;
    return (seen_bits & 0x80) == 0;
  }

  // Message used instead of the generic "invalid UTF8" one from the base class.
  static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); }
};

// String transform that reverses each input string one UTF8 codepoint at a
// time: codepoints are emitted in reverse order while the bytes inside each
// codepoint keep their original order.
template <typename Type>
struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
  using Base = StringTransform<Type, Utf8Reverse<Type>>;
  using offset_type = typename Base::offset_type;

  // Write the codepoints of `input` to `output` in reverse order and store
  // `input_string_ncodeunits` in `*output_written`.
  //
  // The input is not validated and this always returns true. The length of
  // each codepoint is taken from its first byte only, so malformed input
  // yields unspecified output bytes, but exactly `input_string_ncodeunits`
  // bytes are still produced.
  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                 uint8_t* output, offset_type* output_written) {
    for (offset_type pos = 0; pos < input_string_ncodeunits;) {
      const uint8_t codepoint_size = util::ValidUtf8CodepointByteSize(input + pos);
      // Clamp the end of the chunk so a truncated trailing sequence cannot
      // read or write past the string boundaries.
      const offset_type chunk_end =
          std::min(pos + codepoint_size, input_string_ncodeunits);
      // A chunk occupying [pos, chunk_end) in the input lands at the mirrored
      // position, i.e. it starts `chunk_end` bytes before the end of the output.
      std::copy(input + pos, input + chunk_end,
                output + input_string_ncodeunits - chunk_end);
      pos += codepoint_size;
    }
    *output_written = input_string_ncodeunits;
    return true;
  }
};

using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;

// Transform a buffer of offsets to one which begins with 0 and has same
Expand Down Expand Up @@ -2305,7 +2349,7 @@ const FunctionDoc ascii_upper_doc(
const FunctionDoc ascii_lower_doc(
"Transform ASCII input to lowercase",
("For each string in `strings`, return a lowercase version.\n\n"
"This function assumes the input is fully ASCII. It it may contain\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_lower\" instead."),
{"strings"});

Expand All @@ -2317,6 +2361,21 @@ const FunctionDoc utf8_lower_doc(
"Transform input to lowercase",
("For each string in `strings`, return a lowercase version."), {"strings"});

// Documentation for the "ascii_reverse" function: a summary line, a longer
// description, and the argument name(s).
const FunctionDoc ascii_reverse_doc(
"Reverse ASCII input",
("For each ASCII string in `strings`, return a reversed version.\n\n"
"This function assumes the input is fully ASCII. If it may contain\n"
"non-ASCII characters, use \"utf8_reverse\" instead."),
{"strings"});

// Documentation for the "utf8_reverse" function: a summary line, a longer
// description, and the argument name(s).
const FunctionDoc utf8_reverse_doc(
"Reverse utf8 input",
("For each utf8 string in `strings`, return a reversed version.\n\n"
"This function operates on codepoints/UTF-8 code units, not grapheme\n"
"clusters. Hence, it will not correctly reverse grapheme clusters\n"
"composed of multiple codepoints."),
{"strings"});

} // namespace

void RegisterScalarStringAscii(FunctionRegistry* registry) {
Expand All @@ -2332,6 +2391,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
&ascii_ltrim_whitespace_doc);
MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
&ascii_rtrim_whitespace_doc);
MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry,
&ascii_lower_doc);
MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,31 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
"[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
}

// Checks "ascii_reverse" on empty, ASCII-only and non-ASCII input.
TYPED_TEST(TestStringKernels, AsciiReverse) {
// An empty array maps to an empty array.
this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
// ASCII strings are reversed; null and empty entries come back unchanged.
this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
R"(["dcba", null, "", "bbb"])");

// Input containing non-ASCII characters must be rejected with an Invalid error.
Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])");
EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
testing::HasSubstr("Non-ASCII sequence in input"),
CallFunction("ascii_reverse", {invalid_input}));
}

TYPED_TEST(TestStringKernels, Utf8Reverse) {
this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
R"(["dcba", null, "", "bbb"])");
this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])",
this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])");

// inputs with malformed utf8 chars would produce garbage output, but the end result
// would produce arrays with same lengths. Hence checking offset buffer equality
auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]");
const Result<Datum>& res = CallFunction("utf8_reverse", {malformed_input});
ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a nit, but FTR we probably have an AssertBufferEquals (or perhaps AssertBuffersEqual :-)).

}

TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) {
// 0x7fff * 0xffff is the max a 32 bit string array can hold
// since the utf8_upper kernel can grow it by 3/2, the max we should accept is
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/util/utf8.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT

uint16_t utf8_large_table[9 * 256] = {0xffff};

// Byte length of a UTF8-encoded codepoint, indexed by the high 4 bits of its
// first byte. Indices 0..11 (ASCII and continuation bytes) map to 1,
// 12..13 to 2, 14 to 3 and 15 to 4.
const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};

static void InitializeLargeTable() {
for (uint32_t state = 0; state < 9; ++state) {
for (uint32_t byte = 0; byte < 256; ++byte) {
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/util/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12;
// In this table states are multiples of 256.
ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];

ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];

// Success / reject states when looked up in the large table
static constexpr uint16_t kUTF8ValidateAccept = 0;
static constexpr uint16_t kUTF8ValidateReject = 256;
Expand Down Expand Up @@ -293,6 +295,18 @@ Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);

static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;

// Return the byte length of the UTF8-encoded codepoint whose first byte is
// `*codeunit`. The length is determined solely from the leading 4 bits of
// that byte, looked up in utf8_byte_size_table:
//   utf8_byte_size_table[0..7]   --> pure ASCII chars    --> 1 byte
//   utf8_byte_size_table[8..11]  --> continuation bytes  --> 1 byte
//   utf8_byte_size_table[12,13]  --> 2-byte UTF8 chars
//   utf8_byte_size_table[14]     --> 3-byte UTF8 chars
//   utf8_byte_size_table[15]     --> 4-byte UTF8 chars
// NOTE: Only the first byte is read and no validation is performed, so the
// result for invalid/malformed UTF8 sequences is not meaningful.
// ex: a leading \xFF returns 4
static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
return internal::utf8_byte_size_table[*codeunit >> 4];
}

// Return true if `codeunit` is a UTF8 continuation byte, i.e. its bit pattern
// is 10xxxxxx.
static inline bool Utf8IsContinuation(const uint8_t codeunit) {
  // Shifting out the low six payload bits leaves only the two tag bits.
  return (codeunit >> 6) == 0x2;
}
Expand Down
33 changes: 22 additions & 11 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -435,47 +435,58 @@ String transforms
+==========================+============+=========================+=====================+=========+=======================================+
| ascii_lower | Unary | String-like | String-like | \(1) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| ascii_reverse | Unary | String-like | String-like | \(2) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| ascii_upper | Unary | String-like | String-like | \(1) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(2) | |
| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(3) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| replace_substring | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| replace_substring | Unary | String-like | String-like | \(3) | :struct:`ReplaceSubstringOptions` |
| replace_substring_regex | Unary | String-like | String-like | \(5) | :struct:`ReplaceSubstringOptions` |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| replace_substring_regex | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` |
| utf8_length | Unary | String-like | Int32 or Int64 | \(6) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| utf8_length | Unary | String-like | Int32 or Int64 | \(5) | |
| utf8_lower | Unary | String-like | String-like | \(7) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| utf8_lower | Unary | String-like | String-like | \(6) | |
| utf8_reverse | Unary | String-like | String-like | \(8) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
| utf8_upper | Unary | String-like | String-like | \(6) | |
| utf8_upper | Unary | String-like | String-like | \(7) | |
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+


* \(1) Each ASCII character in the input is converted to lowercase or
uppercase. Non-ASCII characters are left untouched.

* \(2) Output is the physical length in bytes of each input element. Output
* \(2) ASCII input is reversed to the output. If non-ASCII characters
are present, ``Invalid`` :class:`Status` will be returned.

* \(3) Output is the physical length in bytes of each input element. Output
type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.

* \(3) Replace non-overlapping substrings that match to
* \(4) Replace non-overlapping substrings that match to
:member:`ReplaceSubstringOptions::pattern` by
:member:`ReplaceSubstringOptions::replacement`. If
:member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
maximum number of replacements made, counting from the left.

* \(4) Replace non-overlapping substrings that match to the regular expression
* \(5) Replace non-overlapping substrings that match to the regular expression
:member:`ReplaceSubstringOptions::pattern` by
:member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
:member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
maximum number of replacements made, counting from the left. Note that if the
pattern contains groups, backreferencing can be used.

* \(5) Output is the number of characters (not bytes) of each input element.
* \(6) Output is the number of characters (not bytes) of each input element.
Output type is Int32 for String, Int64 for LargeString.

* \(6) Each UTF8-encoded character in the input is converted to lowercase or
* \(7) Each UTF8-encoded character in the input is converted to lowercase or
uppercase.

* \(8) Each UTF8-encoded codepoint is written in reverse order to the output
(the bytes within a codepoint keep their original order).
If the input is not valid UTF8, then the output is undefined (but the size of output
buffers will be preserved).


String trimming
~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,10 @@ String Transforms
:toctree: ../generated/

ascii_lower
ascii_reverse
ascii_upper
utf8_lower
utf8_reverse
utf8_upper

Containment tests
Expand Down