diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 3084753208e..e0e4a354d79 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -3016,6 +3016,41 @@ void AddBinaryRepeat(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Transform that emits the bytes of each input value in reverse order.
+// Works on raw bytes only: nothing here decodes or validates an encoding.
+struct BinaryReverseTransform : public StringTransformBase {
+  // Writes input[0..n) to output back to front, n = input_string_ncodeunits,
+  // and returns n. An empty input (n == 0) writes nothing and returns 0.
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    for (int64_t i = 0; i < input_string_ncodeunits; i++) {
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    return input_string_ncodeunits;
+  }
+};
+
+// Plugs BinaryReverseTransform into StringTransformExec for a given `Type`.
+template <typename Type>
+using BinaryReverse = StringTransformExec<Type, BinaryReverseTransform>;
+
+const FunctionDoc binary_reverse_doc(
+    "Reverse binary input",
+    ("For each binary string in `strings`, return a reversed version.\n\n"
+     "This function reverses the binary data at a byte-level."),
+    {"strings"});
+
+// Registers the unary "binary_reverse" function, adding one kernel per type
+// returned by BinaryTypes(); each kernel uses `ty` as both input and output.
+void AddBinaryReverse(FunctionRegistry* registry) {
+  auto func = std::make_shared<ScalarFunction>("binary_reverse", Arity::Unary(),
+                                               &binary_reverse_doc);
+  for (const auto& ty : BinaryTypes()) {
+    DCHECK_OK(func->AddKernel({ty}, ty, GenerateVarBinaryToVarBinary<BinaryReverse>(ty)));
+  }
+  DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
 // ----------------------------------------------------------------------
 // Replace substring (plain, regex)
 
@@ -5211,6 +5246,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
   AddStrptime(registry);
   AddBinaryJoin(registry);
   AddBinaryRepeat(registry);
+  AddBinaryReverse(registry);
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index fd67e6c4d33..e3b17da4bf7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -392,6 +392,15 @@ TYPED_TEST(TestBinaryKernels, NonUtf8WithNullRegex) {
 }
 #endif
 
+TYPED_TEST(TestBinaryKernels, BinaryReverse) {
+  this->CheckUnary(
+      "binary_reverse",
+      this->template MakeArray<std::string>(
+          {{"abc123", 6}, {"\x00\x00\x42\xfe\xff", 5}, {"\xf0", 1}, {"", 0}}),
+      this->template MakeArray<std::string>(
+          {{"321cba", 6}, {"\xff\xfe\x42\x00\x00", 5}, {"\xf0", 1}, {"", 0}}));
+}
+
 TYPED_TEST(TestBaseBinaryKernels, BinaryReplaceSlice) {
   ReplaceSliceOptions options{0, 1, "XX"};
   this->CheckUnary("binary_replace_slice", "[]", this->type(), "[]", &options);
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index a01ba0532c5..1f085de8c7c 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -2400,6 +2400,12 @@ const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes() {
   return g_base_binary_types;
 }
 
+// Returns the binary() and large_binary() types; the vector is built once.
+const std::vector<std::shared_ptr<DataType>>& BinaryTypes() {
+  static DataTypeVector types = {binary(), large_binary()};
+  return types;
+}
+
 const std::vector<std::shared_ptr<DataType>>& StringTypes() {
   static DataTypeVector types = {utf8(), large_utf8()};
   return types;
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index fd461b7b14e..c273cf028bc 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -2038,6 +2038,9 @@ const std::vector<std::shared_ptr<DataType>>& NumericTypes();
 ARROW_EXPORT
 const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
+// Binary types: binary() and large_binary()
 ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& BinaryTypes();
+ARROW_EXPORT
 const std::vector<std::shared_ptr<DataType>>& StringTypes();
 // Temporal types including time and timestamps for each unit
 ARROW_EXPORT
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 67db2cf21e7..76f1d705dc9 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -851,25 +851,27 @@ String transforms
 +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
 | binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) |
 +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
-| 
replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) | +| binary_reverse | Unary | Binary | Binary | | \(6) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | +| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_capitalize | Unary | String-like | String-like | | \(8) | +| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(8) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_length | Unary | String-like | Int32 or Int64 | | \(9) | +| utf8_capitalize | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_lower | Unary | String-like | String-like | | \(8) | +| utf8_length | Unary | String-like | Int32 or Int64 | | \(10) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(6) | +| utf8_lower | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_reverse | Unary | String-like | String-like | | \(10) | +| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | 
\(5) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_swapcase | Unary | String-like | String-like | | \(8) | +| utf8_reverse | Unary | String-like | String-like | | \(11) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_title | Unary | String-like | String-like | | \(8) | +| utf8_swapcase | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ -| utf8_upper | Unary | String-like | String-like | | \(8) | +| utf8_title | Unary | String-like | String-like | | \(9) | ++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ +| utf8_upper | Unary | String-like | String-like | | \(9) | +-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+ * \(1) Each ASCII character in the input is converted to lowercase or @@ -888,26 +890,28 @@ String transforms :member:`ReplaceSubstringOptions::replacement`. The binary kernel measures the slice in bytes, while the UTF8 kernel measures the slice in codeunits. -* \(6) Replace non-overlapping substrings that match to +* \(6) Perform a byte-level reverse. + +* \(7) Replace non-overlapping substrings that match to :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. 
-* \(7) Replace non-overlapping substrings that match to the regular expression +* \(8) Replace non-overlapping substrings that match to the regular expression :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. Note that if the pattern contains groups, backreferencing can be used. -* \(8) Each UTF8-encoded character in the input is converted to lowercase or +* \(9) Each UTF8-encoded character in the input is converted to lowercase or uppercase. -* \(9) Output is the number of characters (not bytes) of each input element. +* \(10) Output is the number of characters (not bytes) of each input element. Output type is Int32 for String, Int64 for LargeString. -* \(10) Each UTF8-encoded code unit is written in reverse order to the output. +* \(11) Each UTF8-encoded code unit is written in reverse order to the output. If the input is not valid UTF8, then the output is undefined (but the size of output buffers will be preserved). diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 00897a24983..704a6cbea74 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -269,6 +269,7 @@ String Transforms binary_length binary_repeat binary_replace_slice + binary_reverse replace_substring replace_substring_regex utf8_capitalize