Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3016,6 +3016,34 @@ void AddBinaryRepeat(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}

// String transform that emits the bytes of a single input value in reverse order.
struct BinaryReverseTransform : public StringTransformBase {
  // Copies `input_string_ncodeunits` bytes from `input` into `output`, last byte
  // first, and returns the number of bytes written (always the input length).
  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output) {
    const int64_t last = input_string_ncodeunits - 1;
    // Walk the source from its final byte down to its first while the
    // destination index (last - src) grows from 0.
    for (int64_t src = last; src >= 0; --src) {
      output[last - src] = input[src];
    }
    return input_string_ncodeunits;
  }
};

// Alias that instantiates StringTransformExec with BinaryReverseTransform for a
// given `Type`; this is the template handed to GenerateVarBinaryToVarBinary when
// the "binary_reverse" kernels are registered.
template <typename Type>
using BinaryReverse = StringTransformExec<Type, BinaryReverseTransform>;

// Documentation object attached to the "binary_reverse" function in
// AddBinaryReverse: a short summary, a longer description, and the list of
// argument names (a single argument, `strings`).
const FunctionDoc binary_reverse_doc(
"Reverse binary input",
("For each binary string in `strings`, return a reversed version.\n\n"
"This function reverses the binary data at a byte-level."),
{"strings"});

// Creates the unary "binary_reverse" scalar function, adds one kernel for every
// type returned by BinaryTypes() (output type equal to the input type), and
// registers the function with `registry`.
void AddBinaryReverse(FunctionRegistry* registry) {
  auto binary_reverse = std::make_shared<ScalarFunction>(
      "binary_reverse", Arity::Unary(), &binary_reverse_doc);
  for (const auto& binary_ty : BinaryTypes()) {
    auto exec = GenerateVarBinaryToVarBinary<BinaryReverse>(binary_ty);
    DCHECK_OK(binary_reverse->AddKernel({binary_ty}, binary_ty, std::move(exec)));
  }
  DCHECK_OK(registry->AddFunction(std::move(binary_reverse)));
}

// ----------------------------------------------------------------------
// Replace substring (plain, regex)

Expand Down Expand Up @@ -5211,6 +5239,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
AddStrptime(registry);
AddBinaryJoin(registry);
AddBinaryRepeat(registry);
AddBinaryReverse(registry);
}

} // namespace internal
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,15 @@ TYPED_TEST(TestBinaryKernels, NonUtf8WithNullRegex) {
}
#endif

// Checks that "binary_reverse" reverses values byte by byte: plain ASCII, a value
// with embedded NUL and non-ASCII bytes, a single byte, and the empty value.
TYPED_TEST(TestBinaryKernels, BinaryReverse) {
  // Each element is built from (pointer, length) so embedded "\x00" bytes are kept.
  const auto input = this->template MakeArray<std::string>(
      {{"abc123", 6}, {"\x00\x00\x42\xfe\xff", 5}, {"\xf0", 1}, {"", 0}});
  const auto reversed = this->template MakeArray<std::string>(
      {{"321cba", 6}, {"\xff\xfe\x42\x00\x00", 5}, {"\xf0", 1}, {"", 0}});
  this->CheckUnary("binary_reverse", input, reversed);
}

TYPED_TEST(TestBaseBinaryKernels, BinaryReplaceSlice) {
ReplaceSliceOptions options{0, 1, "XX"};
this->CheckUnary("binary_replace_slice", "[]", this->type(), "[]", &options);
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2400,6 +2400,11 @@ const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes() {
return g_base_binary_types;
}

const std::vector<std::shared_ptr<DataType>>& BinaryTypes() {
static DataTypeVector types = {binary(), large_binary()};
return types;
}

const std::vector<std::shared_ptr<DataType>>& StringTypes() {
static DataTypeVector types = {utf8(), large_utf8()};
return types;
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -2038,6 +2038,8 @@ const std::vector<std::shared_ptr<DataType>>& NumericTypes();
ARROW_EXPORT
const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
// Binary types: binary() and large_binary()
ARROW_EXPORT
const std::vector<std::shared_ptr<DataType>>& BinaryTypes();
ARROW_EXPORT
const std::vector<std::shared_ptr<DataType>>& StringTypes();
// Temporal types including time and timestamps for each unit
ARROW_EXPORT
Expand Down
34 changes: 19 additions & 15 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -851,25 +851,27 @@ String transforms
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| binary_replace_slice | Unary | String-like | Binary- or String-like | :struct:`ReplaceSliceOptions` | \(5) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(6) |
| binary_reverse | Unary | Binary | Binary | | \(6) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) |
| replace_substring | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(7) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_capitalize | Unary | String-like | String-like | | \(8) |
| replace_substring_regex | Unary | String-like | String-like | :struct:`ReplaceSubstringOptions` | \(8) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_length | Unary | String-like | Int32 or Int64 | | \(9) |
| utf8_capitalize | Unary | String-like | String-like | | \(9) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_lower | Unary | String-like | String-like | | \(8) |
| utf8_length | Unary | String-like | Int32 or Int64 | | \(10) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(6) |
| utf8_lower | Unary | String-like | String-like | | \(9) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_reverse | Unary | String-like | String-like | | \(10) |
| utf8_replace_slice | Unary | String-like | String-like | :struct:`ReplaceSliceOptions` | \(7) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_swapcase | Unary | String-like | String-like | | \(8) |
| utf8_reverse | Unary | String-like | String-like | | \(11) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_title | Unary | String-like | String-like | | \(8) |
| utf8_swapcase | Unary | String-like | String-like | | \(9) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_upper | Unary | String-like | String-like | | \(8) |
| utf8_title | Unary | String-like | String-like | | \(9) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
| utf8_upper | Unary | String-like | String-like | | \(9) |
+-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+

* \(1) Each ASCII character in the input is converted to lowercase or
Expand All @@ -888,26 +890,28 @@ String transforms
:member:`ReplaceSubstringOptions::replacement`. The binary kernel measures the
slice in bytes, while the UTF8 kernel measures the slice in codeunits.

* \(6) Replace non-overlapping substrings that match to
* \(6) Perform a byte-level reverse.

* \(7) Replace non-overlapping substrings that match
:member:`ReplaceSubstringOptions::pattern` with
:member:`ReplaceSubstringOptions::replacement`. If
:member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
maximum number of replacements made, counting from the left.

* \(7) Replace non-overlapping substrings that match to the regular expression
* \(8) Replace non-overlapping substrings that match the regular expression
:member:`ReplaceSubstringOptions::pattern` with
:member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If
:member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
maximum number of replacements made, counting from the left. Note that if the
pattern contains groups, backreferencing can be used.

* \(8) Each UTF8-encoded character in the input is converted to lowercase or
* \(9) Each UTF8-encoded character in the input is converted to lowercase or
uppercase.

* \(9) Output is the number of characters (not bytes) of each input element.
* \(10) Output is the number of characters (not bytes) of each input element.
Output type is Int32 for String, Int64 for LargeString.

* \(10) Each UTF8-encoded code unit is written in reverse order to the output.
* \(11) Each UTF8-encoded code unit is written in reverse order to the output.
If the input is not valid UTF8, then the output is undefined (but the size of output
buffers will be preserved).

Expand Down
1 change: 1 addition & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ String Transforms
binary_length
binary_repeat
binary_replace_slice
binary_reverse
replace_substring
replace_substring_regex
utf8_capitalize
Expand Down