From 4e8374be036b4879a9669a40eb451670ce44bdff Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Tue, 7 Dec 2021 16:16:39 +0100 Subject: [PATCH 1/7] Add binary reverse kernel --- .../arrow/compute/kernels/scalar_string.cc | 29 +++++++++++++++++++ .../compute/kernels/scalar_string_test.cc | 5 ++++ 2 files changed, 34 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 3084753208e..77f147df102 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3016,6 +3016,34 @@ void AddBinaryRepeat(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +struct BinaryReverseTransform : public StringTransformBase { + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + for (int64_t i = 0; i < input_string_ncodeunits; i++) { + output[input_string_ncodeunits - i - 1] = input[i]; + } + return input_string_ncodeunits; + } +}; + +template +using BinaryReverse = StringTransformExec; + +const FunctionDoc binary_reverse_doc( + "Reverse binary input", + ("For each sequence of bytes in `binaries`, return a reversed version.\n\n" + "This function reverses the binary data at a byte-level."), + {"binaries"}); + +void AddBinaryReverse(FunctionRegistry* registry) { + auto func = std::make_shared("binary_reverse", Arity::Unary(), + &binary_reverse_doc); + for (const auto& ty : BaseBinaryTypes()) { + DCHECK_OK(func->AddKernel({ty}, ty, GenerateVarBinaryToVarBinary(ty))); + } + DCHECK_OK(registry->AddFunction(std::move(func))); +} + // ---------------------------------------------------------------------- // Replace substring (plain, regex) @@ -5211,6 +5239,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { AddStrptime(registry); AddBinaryJoin(registry); AddBinaryRepeat(registry); + AddBinaryReverse(registry); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index fd67e6c4d33..d495765cf33 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -137,6 +137,11 @@ TYPED_TEST(TestBaseBinaryKernels, BinaryLength) { this->offset_type(), "[3, 4, 2]"); } +TYPED_TEST(TestBinaryKernels, BinaryReverse) { + this->CheckUnary("binary_reverse", this->MakeArray({"abc123", "\x01", "", "\x01\xfe"}), + this->MakeArray({"321cba", "\x01", "", "\xfe\x01"})); +} + // The NonUtf8XXX tests use kernels that do not accept invalid UTF-8 when // processing [Large]StringType data. These tests use invalid UTF-8 inputs. TYPED_TEST(TestBinaryKernels, NonUtf8) { From 1114c9357f40f29791871267056cfe2c6ff5bfda Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Wed, 8 Dec 2021 09:14:39 +0100 Subject: [PATCH 2/7] Add docs --- docs/source/cpp/compute.rst | 7 +++++++ docs/source/python/api/compute.rst | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 67db2cf21e7..cfba963b63d 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -851,6 +851,8 @@ String transforms +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ +| binary_reverse | Unary | Binary- or String-like | Binary- or String-like | | \(11) | ++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | @@ -911,6 +913,11 @@ String transforms If the input is not valid UTF8, then the output is undefined (but the size of output buffers will be preserved). +* \(11) Performs a byte-level reverse. Does not take encoding into consideration. + Applying `binary_reverse` to strings using specific encodings with multiple bytes + per code point (e.g. UTF8) will produce invalid strings. Use e.g. `utf8_reverse` + instead. + String padding ~~~~~~~~~~~~~~ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 00897a24983..704a6cbea74 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -269,6 +269,7 @@ String Transforms binary_length binary_repeat binary_replace_slice + binary_reverse replace_substring replace_substring_regex utf8_capitalize From deb6fdf7d665be93defe265689f655aa6ec5caa8 Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Wed, 8 Dec 2021 09:57:03 +0100 Subject: [PATCH 3/7] Test String type as well --- .../arrow/compute/kernels/scalar_string.cc | 8 ++++--- .../compute/kernels/scalar_string_test.cc | 22 ++++++++++++++----- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 77f147df102..1554acfcc29 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3031,9 +3031,11 @@ using BinaryReverse = StringTransformExec; const FunctionDoc binary_reverse_doc( "Reverse binary input", - ("For each sequence of bytes in `binaries`, return a reversed version.\n\n" - "This function reverses the binary data at a byte-level."), - {"binaries"}); + ("For each binary string in `strings`, return a reversed version.\n\n" + "This function reverses the binary data at a byte-level.\n\n" + "Applying `binary_reverse` to strings using specific encodings with\n" + "multiple bytes per code point (e.g. UTF8) may produce invalid strings."), + {"strings"}); void AddBinaryReverse(FunctionRegistry* registry) { auto func = std::make_shared("binary_reverse", Arity::Unary(), diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index d495765cf33..cc6d6f775aa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -137,11 +137,6 @@ TYPED_TEST(TestBaseBinaryKernels, BinaryLength) { this->offset_type(), "[3, 4, 2]"); } -TYPED_TEST(TestBinaryKernels, BinaryReverse) { - this->CheckUnary("binary_reverse", this->MakeArray({"abc123", "\x01", "", "\x01\xfe"}), - this->MakeArray({"321cba", "\x01", "", "\xfe\x01"})); -} - // The NonUtf8XXX tests use kernels that do not accept invalid UTF-8 when // processing [Large]StringType data. These tests use invalid UTF-8 inputs. TYPED_TEST(TestBinaryKernels, NonUtf8) { @@ -397,6 +392,23 @@ TYPED_TEST(TestBinaryKernels, NonUtf8WithNullRegex) { } #endif +TYPED_TEST(TestBinaryKernels, BinaryReverse) { + this->CheckUnary( + "binary_reverse", + this->template MakeArray( + {{"abc123", 6}, {"\x00\x00\x42\xfe\xff", 5}, {"\xf0", 1}, {"", 0}}), + this->template MakeArray( + {{"321cba", 6}, {"\xff\xfe\x42\x00\x00", 5}, {"\xf0", 1}, {"", 0}})); +} + +TYPED_TEST(TestBaseBinaryKernels, BinaryReverse) { + this->CheckUnary("binary_reverse", + this->template MakeArray( + {{"abc123", 6}, {"01.23", 5}, {"q", 1}, {"", 0}}), + this->template MakeArray( + {{"321cba", 6}, {"32.10", 5}, {"q", 1}, {"", 0}})); +} + TYPED_TEST(TestBaseBinaryKernels, BinaryReplaceSlice) { ReplaceSliceOptions options{0, 1, "XX"}; this->CheckUnary("binary_replace_slice", "[]", this->type(), "[]", &options); From 8d86cbb1400fc6def19bafa7618522e9a711efd1 Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Wed, 8 Dec 2021 14:44:23 +0100 Subject: [PATCH 4/7] Add BinaryTypes() type set for binary strings without specified encoding --- cpp/src/arrow/type.cc | 5 +++++ cpp/src/arrow/type.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a01ba0532c5..1f085de8c7c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -2400,6 +2400,11 @@ const std::vector>& BaseBinaryTypes() { return g_base_binary_types; } +const std::vector>& BinaryTypes() { + static DataTypeVector types = {binary(), large_binary()}; + return types; +} + const std::vector>& StringTypes() { static DataTypeVector types = {utf8(), large_utf8()}; return types; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index fd461b7b14e..c273cf028bc 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2038,6 +2038,8 @@ const std::vector>& NumericTypes(); ARROW_EXPORT const std::vector>& BaseBinaryTypes(); ARROW_EXPORT +const std::vector>& BinaryTypes(); +ARROW_EXPORT const std::vector>& StringTypes(); // Temporal types including time and timestamps for each unit ARROW_EXPORT From 9c513385eb97aafece01f557447358c3b5151a32 Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Wed, 8 Dec 2021 14:47:01 +0100 Subject: [PATCH 5/7] Allow binary_reverse to be applied only to BinaryTypes --- cpp/src/arrow/compute/kernels/scalar_string.cc | 6 ++---- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 8 -------- docs/source/cpp/compute.rst | 7 ++----- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 1554acfcc29..e0e4a354d79 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -3032,15 +3032,13 @@ using BinaryReverse = StringTransformExec; const FunctionDoc binary_reverse_doc( "Reverse binary input", ("For each binary string in `strings`, return a reversed version.\n\n" - "This function reverses the binary data at a byte-level.\n\n" - "Applying `binary_reverse` to strings using specific encodings with\n" - "multiple bytes per code point (e.g. UTF8) may produce invalid strings."), + "This function reverses the binary data at a byte-level."), {"strings"}); void AddBinaryReverse(FunctionRegistry* registry) { auto func = std::make_shared("binary_reverse", Arity::Unary(), &binary_reverse_doc); - for (const auto& ty : BaseBinaryTypes()) { + for (const auto& ty : BinaryTypes()) { DCHECK_OK(func->AddKernel({ty}, ty, GenerateVarBinaryToVarBinary(ty))); } DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index cc6d6f775aa..e3b17da4bf7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -401,14 +401,6 @@ TYPED_TEST(TestBinaryKernels, BinaryReverse) { {{"321cba", 6}, {"\xff\xfe\x42\x00\x00", 5}, {"\xf0", 1}, {"", 0}})); } -TYPED_TEST(TestBaseBinaryKernels, BinaryReverse) { - this->CheckUnary("binary_reverse", - this->template MakeArray( - {{"abc123", 6}, {"01.23", 5}, {"q", 1}, {"", 0}}), - this->template MakeArray( - {{"321cba", 6}, {"32.10", 5}, {"q", 1}, {"", 0}})); -} - TYPED_TEST(TestBaseBinaryKernels, BinaryReplaceSlice) { ReplaceSliceOptions options{0, 1, "XX"}; this->CheckUnary("binary_replace_slice", "[]", this->type(), "[]", &options); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index cfba963b63d..e8fe3ad088f 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -851,7 +851,7 @@ String transforms +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| binary_reverse | Unary | Binary- or String-like | Binary- or String-like | | \(11) | +| binary_reverse | Unary | Binary | Binary | | \(11) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ @@ -913,10 +913,7 @@ String transforms If the input is not valid UTF8, then the output is undefined (but the size of output buffers will be preserved). -* \(11) Performs a byte-level reverse. Does not take encoding into consideration. - Applying `binary_reverse` to strings using specific encodings with multiple bytes - per code point (e.g. UTF8) will produce invalid strings. Use e.g. `utf8_reverse` - instead. +* \(11) Performs a byte-level reverse. Does not make any assumptions about encoding. String padding ~~~~~~~~~~~~~~ From 93914b733ebf4bebbf071f42895779957e3f0361 Mon Sep 17 00:00:00 2001 From: Johan Peltenburg Date: Wed, 8 Dec 2021 20:14:05 +0100 Subject: [PATCH 6/7] Make notes appear in ascending order --- docs/source/cpp/compute.rst | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e8fe3ad088f..5a9066fdee5 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -851,27 +851,27 @@ String transforms +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| binary_reverse | Unary | Binary | Binary | | \(11) | +| binary_reverse | Unary | Binary | Binary | | \(6) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | +| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(8) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_capitalize | Unary | String-like | String-like | | \(8) | +| utf8_capitalize | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_length | Unary | String-like | Int32 or Int64 | | \(9) | +| utf8_length | Unary | String-like | Int32 or Int64 | | \(10) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_lower | Unary | String-like | String-like | | \(8) | +| utf8_lower | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(6) | +| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(7) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_reverse | Unary | String-like | String-like | | \(10) | +| utf8_reverse | Unary | String-like | String-like | | \(11) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_swapcase | Unary | String-like | String-like | | \(8) | +| utf8_swapcase | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_title | Unary | String-like | String-like | | \(8) | +| utf8_title | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_upper | Unary | String-like | String-like | | \(8) | +| utf8_upper | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ * \(1) Each ASCII character in the input is converted to lowercase or @@ -890,31 +890,31 @@ String transforms :member:`ReplaceSubstringOptions::replacement`. The binary kernel measures the slice in bytes, while the UTF8 kernel measures the slice in codeunits. -* \(6) Replace non-overlapping substrings that match to +* \(6) Perform a byte-level reverse. + +* \(7) Replace non-overlapping substrings that match to :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. -* \(7) Replace non-overlapping substrings that match to the regular expression +* \(8) Replace non-overlapping substrings that match to the regular expression :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. Note that if the pattern contains groups, backreferencing can be used. -* \(8) Each UTF8-encoded character in the input is converted to lowercase or +* \(9) Each UTF8-encoded character in the input is converted to lowercase or uppercase. -* \(9) Output is the number of characters (not bytes) of each input element. +* \(10) Output is the number of characters (not bytes) of each input element. Output type is Int32 for String, Int64 for LargeString. -* \(10) Each UTF8-encoded code unit is written in reverse order to the output. +* \(11) Each UTF8-encoded code unit is written in reverse order to the output. If the input is not valid UTF8, then the output is undefined (but the size of output buffers will be preserved). -* \(11) Performs a byte-level reverse. Does not make any assumptions about encoding. - String padding ~~~~~~~~~~~~~~ From be6f7f927fa6812d33acbdc0100345a6ae94bc25 Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 10 Dec 2021 08:42:00 -0500 Subject: [PATCH 7/7] ARROW-14306: [C++] Fix table alignment --- docs/source/cpp/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 5a9066fdee5..76f1d705dc9 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -851,7 +851,7 @@ String transforms +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| binary_reverse | Unary | Binary | Binary | | \(6) | +| binary_reverse | Unary | Binary | Binary | | \(6) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ | replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+