diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 65196b2a491..00b1fcca9e2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -142,6 +142,11 @@ struct StringTransform { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return Derived().Execute(ctx, batch, out); } + + static Status InvalidStatus() { + return Status::Invalid("Invalid UTF8 sequence in input"); + } + Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].kind() == Datum::ARRAY) { const ArrayData& input = *batch[0].array(); @@ -173,7 +178,7 @@ struct StringTransform { if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( input_string, input_string_ncodeunits, output_str + output_ncodeunits, &encoded_nbytes))) { - return Status::Invalid("Invalid UTF8 sequence in input"); + return Derived::InvalidStatus(); } output_ncodeunits += encoded_nbytes; output_string_offsets[i + 1] = output_ncodeunits; @@ -199,7 +204,7 @@ struct StringTransform { if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( input.value->data(), data_nbytes, value_buffer->mutable_data(), &encoded_nbytes))) { - return Status::Invalid("Invalid UTF8 sequence in input"); + return Derived::InvalidStatus(); } RETURN_NOT_OK(value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true)); } @@ -266,6 +271,45 @@ void EnsureLookupTablesFilled() {} #endif // ARROW_WITH_UTF8PROC +template +struct AsciiReverse : StringTransform> { + using Base = StringTransform>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + uint8_t utf8_char_found = 0; + for (offset_type i = 0; i < input_string_ncodeunits; i++) { + // if a utf8 char is found, report to utf8_char_found + utf8_char_found |= input[i] & 0x80; + output[input_string_ncodeunits - i - 1] = 
input[i]; + } + *output_written = input_string_ncodeunits; + return utf8_char_found == 0; + } + + static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); } +}; + +template +struct Utf8Reverse : StringTransform> { + using Base = StringTransform>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + offset_type i = 0; + while (i < input_string_ncodeunits) { + uint8_t offset = util::ValidUtf8CodepointByteSize(input + i); + offset_type stride = std::min(i + offset, input_string_ncodeunits); + std::copy(input + i, input + stride, output + input_string_ncodeunits - stride); + i += offset; + } + *output_written = input_string_ncodeunits; + return true; + } +}; + using TransformFunc = std::function; // Transform a buffer of offsets to one which begins with 0 and has same @@ -2305,7 +2349,7 @@ const FunctionDoc ascii_upper_doc( const FunctionDoc ascii_lower_doc( "Transform ASCII input to lowercase", ("For each string in `strings`, return a lowercase version.\n\n" - "This function assumes the input is fully ASCII. It it may contain\n" + "This function assumes the input is fully ASCII. If it may contain\n" "non-ASCII characters, use \"utf8_lower\" instead."), {"strings"}); @@ -2317,6 +2361,21 @@ const FunctionDoc utf8_lower_doc( "Transform input to lowercase", ("For each string in `strings`, return a lowercase version."), {"strings"}); +const FunctionDoc ascii_reverse_doc( + "Reverse ASCII input", + ("For each ASCII string in `strings`, return a reversed version.\n\n" + "This function assumes the input is fully ASCII. 
If it may contain\n" + "non-ASCII characters, use \"utf8_reverse\" instead."), + {"strings"}); + +const FunctionDoc utf8_reverse_doc( + "Reverse utf8 input", + ("For each utf8 string in `strings`, return a reversed version.\n\n" + "This function operates on codepoints/UTF-8 code units, not grapheme\n" + "clusters. Hence, it will not correctly reverse grapheme clusters\n" + "composed of multiple codepoints."), + {"strings"}); + } // namespace void RegisterScalarStringAscii(FunctionRegistry* registry) { @@ -2332,6 +2391,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { &ascii_ltrim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_rtrim_whitespace", registry, &ascii_rtrim_whitespace_doc); + MakeUnaryStringBatchKernel("ascii_reverse", registry, &ascii_reverse_doc); + MakeUnaryStringBatchKernel("utf8_reverse", registry, &utf8_reverse_doc); MakeUnaryStringBatchKernelWithState("ascii_trim", registry, &ascii_lower_doc); MakeUnaryStringBatchKernelWithState("ascii_ltrim", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a59634b7be8..2deac9bc5c4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -91,6 +91,31 @@ TYPED_TEST(TestStringKernels, AsciiLower) { "[\"aaazzæÆ&\", null, \"\", \"bbb\"]"); } +TYPED_TEST(TestStringKernels, AsciiReverse) { + this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); + this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), + R"(["dcba", null, "", "bbb"])"); + + Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Non-ASCII sequence in input"), + CallFunction("ascii_reverse", {invalid_input})); +} + +TYPED_TEST(TestStringKernels, Utf8Reverse) { + this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); + this->CheckUnary("utf8_reverse", 
R"(["abcd", null, "", "bbb"])", this->type(), + R"(["dcba", null, "", "bbb"])"); + this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", + this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])"); + + // inputs with malformed utf8 chars would produce garbage output, but the end result + // would produce arrays with same lengths. Hence checking offset buffer equality + auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]"); + const Result& res = CallFunction("utf8_reverse", {malformed_input}); + ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1])); +} + TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { // 0x7fff * 0xffff is the max a 32 bit string array can hold // since the utf8_upper kernel can grow it by 3/2, the max we should accept is is diff --git a/cpp/src/arrow/util/utf8.cc b/cpp/src/arrow/util/utf8.cc index 478d8ade95f..11394d2e64c 100644 --- a/cpp/src/arrow/util/utf8.cc +++ b/cpp/src/arrow/util/utf8.cc @@ -64,6 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT uint16_t utf8_large_table[9 * 256] = {0xffff}; +const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + static void InitializeLargeTable() { for (uint32_t state = 0; state < 9; ++state) { for (uint32_t byte = 0; byte < 256; ++byte) { diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 2d94ca4986e..310d6913403 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -65,6 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12; // In this table states are multiples of 256. 
ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256]; +ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16]; + // Success / reject states when looked up in the large table static constexpr uint16_t kUTF8ValidateAccept = 0; static constexpr uint16_t kUTF8ValidateReject = 256; @@ -293,6 +295,18 @@ Result SkipUTF8BOM(const uint8_t* data, int64_t size); static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000; +// The size of a valid UTF-8 sequence is given by the leading 4 bits of its first byte: +// utf8_byte_size_table[0..7] --> pure ASCII chars --> 1B length +// utf8_byte_size_table[8..11] --> continuation bytes --> 1B length +// utf8_byte_size_table[12,13] --> 2B long UTF8 chars +// utf8_byte_size_table[14] --> 3B long UTF8 chars +// utf8_byte_size_table[15] --> 4B long UTF8 chars +// NOTE: Results for invalid/malformed UTF-8 sequences are undefined. +// e.g. \xFF... returns 4B +static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) { + return internal::utf8_byte_size_table[*codeunit >> 4]; +} + static inline bool Utf8IsContinuation(const uint8_t codeunit) { return (codeunit & 0xC0) == 0x80; // upper two bits should be 10 } diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 592dc4ec1b0..1cee7bcf266 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -435,47 +435,58 @@ String transforms +==========================+============+=========================+=====================+=========+=======================================+ | ascii_lower | Unary | String-like | String-like | \(1) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ +| ascii_reverse | Unary | String-like | String-like | \(2) | | ++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ | ascii_upper | Unary | String-like | String-like | \(1) | | 
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(2) | | +| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(3) | | ++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ +| replace_substring | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| replace_substring | Unary | String-like | String-like | \(3) | :struct:`ReplaceSubstringOptions` | +| replace_substring_regex | Unary | String-like | String-like | \(5) | :struct:`ReplaceSubstringOptions` | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| replace_substring_regex | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` | +| utf8_length | Unary | String-like | Int32 or Int64 | \(6) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_length | Unary | String-like | Int32 or Int64 | \(5) | | +| utf8_lower | Unary | String-like | String-like | \(7) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_lower | Unary | String-like | String-like | \(6) | | +| utf8_reverse | Unary | String-like | String-like | \(8) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_upper | Unary | String-like | String-like | \(6) | | +| utf8_upper | Unary | String-like | String-like | \(7) | | 
+--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ * \(1) Each ASCII character in the input is converted to lowercase or uppercase. Non-ASCII characters are left untouched. -* \(2) Output is the physical length in bytes of each input element. Output +* \(2) ASCII input is reversed to the output. If non-ASCII characters + are present, ``Invalid`` :class:`Status` will be returned. + +* \(3) Output is the physical length in bytes of each input element. Output type is Int32 for Binary / String, Int64 for LargeBinary / LargeString. -* \(3) Replace non-overlapping substrings that match to +* \(4) Replace non-overlapping substrings that match to :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. -* \(4) Replace non-overlapping substrings that match to the regular expression +* \(5) Replace non-overlapping substrings that match to the regular expression :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. Note that if the pattern contains groups, backreferencing can be used. -* \(5) Output is the number of characters (not bytes) of each input element. +* \(6) Output is the number of characters (not bytes) of each input element. Output type is Int32 for String, Int64 for LargeString. -* \(6) Each UTF8-encoded character in the input is converted to lowercase or +* \(7) Each UTF8-encoded character in the input is converted to lowercase or uppercase. +* \(8) Each UTF8-encoded code point is written in reverse order to the output. 
+ If the input is not valid UTF8, then the output is undefined (but the size of output + buffers will be preserved). + String trimming ~~~~~~~~~~~~~~~ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index da16ccdfa29..d74732933e7 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -144,8 +144,10 @@ String Transforms :toctree: ../generated/ ascii_lower + ascii_reverse ascii_upper utf8_lower + utf8_reverse utf8_upper Containment tests