From 50eb552f99c02ce1c8656adbfdb748211b78ae61 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 12 May 2021 16:59:08 -0400 Subject: [PATCH 01/15] adding basic impl for string reverse --- .../arrow/compute/kernels/scalar_string.cc | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 65196b2a491..ae022ef0e6c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -266,6 +266,22 @@ void EnsureLookupTablesFilled() {} #endif // ARROW_WITH_UTF8PROC +template +struct StringReverse : StringTransform> { + using Base = StringTransform>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + std::reverse_copy(input, input + input_string_ncodeunits, output); + // todo: this is not needed for reverse. may be change StringTransform struct to + // handle 2 cases. 1. for same size string transform and 2. variable size string + // transform + *output_written = input_string_ncodeunits; + return true; + } +}; + using TransformFunc = std::function; // Transform a buffer of offsets to one which begins with 0 and has same @@ -2317,6 +2333,10 @@ const FunctionDoc utf8_lower_doc( "Transform input to lowercase", ("For each string in `strings`, return a lowercase version."), {"strings"}); +const FunctionDoc string_reverse_doc( + "Reverse input ", ("For each string in `strings`, return a reversed version."), + {"strings"}); + } // namespace void RegisterScalarStringAscii(FunctionRegistry* registry) { @@ -2332,6 +2352,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { &ascii_ltrim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_rtrim_whitespace", registry, &ascii_rtrim_whitespace_doc); + MakeUnaryStringBatchKernel("string_reverse", registry, + &string_reverse_doc); MakeUnaryStringBatchKernelWithState("ascii_trim", registry, &ascii_lower_doc); MakeUnaryStringBatchKernelWithState("ascii_ltrim", registry, From 991152938bf01836fa3de0cdc84418db0a81ca19 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Wed, 12 May 2021 17:58:49 -0400 Subject: [PATCH 02/15] adding basic test case --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a59634b7be8..10d77c4a290 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -91,6 +91,14 @@ TYPED_TEST(TestStringKernels, AsciiLower) { "[\"aaazzæÆ&\", null, \"\", \"bbb\"]"); } +TYPED_TEST(TestStringKernels, StringReverse) { + this->CheckUnary("string_reverse", "[]", this->type(), "[]"); + this->CheckUnary("string_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), + "[\"dcba\", null, \"\", \"bbb\"]"); +// this->CheckUnary("string_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(), +// "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]"); +} + TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { // 0x7fff * 0xffff is the max a 32 bit string array can hold // since the utf8_upper kernel can grow it by 3/2, the max we should accept is is From f8fe15d5278d6781f374647f96243fe1d9527581 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 13 May 2021 14:33:39 -0400 Subject: [PATCH 03/15] separating ascii and utf8 reverse kernels --- .../arrow/compute/kernels/scalar_string.cc | 51 ++++++++++++++++--- .../compute/kernels/scalar_string_test.cc | 20 ++++++-- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index ae022ef0e6c..2b5795a9cfe 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -267,17 +267,48 @@ void EnsureLookupTablesFilled() {} #endif // ARROW_WITH_UTF8PROC template -struct StringReverse : StringTransform> { - using Base = StringTransform>; +struct AsciiReverse : StringTransform> { + using Base = StringTransform>; using offset_type = typename Base::offset_type; bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, uint8_t* output, offset_type* output_written) { - std::reverse_copy(input, input + input_string_ncodeunits, output); + uint8_t failure = 0; + for (offset_type i = 0; i < input_string_ncodeunits; i++) { + failure |= input[i] & 0x80; // if a utf8 char is found, report to failure + output[input_string_ncodeunits - i - 1] = input[i]; + } + // std::reverse_copy(input, input + input_string_ncodeunits, output); // todo: this is not needed for reverse. may be change StringTransform struct to // handle 2 cases. 1. for same size string transform and 2. variable size string // transform *output_written = input_string_ncodeunits; + return failure == 0; + } +}; + +// todo move to a proper place +/** + * UTF8 codeunit size can be determined by looking at the first 4 bits of BYTE1 + */ +const std::array UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 2, 2, 3, 4}; + +template +struct Utf8Reverse : StringTransform> { + using Base = StringTransform>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + offset_type i = 0; + while (i < input_string_ncodeunits) { + uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; + std::copy(input + i, input + (i + offset), + output + (input_string_ncodeunits - i - offset)); + i += offset; + } + *output_written = input_string_ncodeunits; return true; } }; @@ -2333,9 +2364,13 @@ const FunctionDoc utf8_lower_doc( "Transform input to lowercase", ("For each string in `strings`, return a lowercase version."), {"strings"}); -const FunctionDoc string_reverse_doc( - "Reverse input ", ("For each string in `strings`, return a reversed version."), - {"strings"}); +const FunctionDoc ascii_reverse_doc( + "Reverse ascii input", + ("For each ascii string in `strings`, return a reversed version."), {"strings"}); + +const FunctionDoc utf8_reverse_doc( + "Reverse utf8 input", + ("For each utf8 string in `strings`, return a reversed version."), {"strings"}); } // namespace @@ -2352,8 +2387,8 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) { &ascii_ltrim_whitespace_doc); MakeUnaryStringBatchKernel("ascii_rtrim_whitespace", registry, &ascii_rtrim_whitespace_doc); - MakeUnaryStringBatchKernel("string_reverse", registry, - &string_reverse_doc); + MakeUnaryStringBatchKernel("ascii_reverse", registry, &ascii_reverse_doc); + MakeUnaryStringBatchKernel("utf8_reverse", registry, &utf8_reverse_doc); MakeUnaryStringBatchKernelWithState("ascii_trim", registry, &ascii_lower_doc); MakeUnaryStringBatchKernelWithState("ascii_ltrim", registry, diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 10d77c4a290..5a55a953071 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -91,12 +91,22 @@ TYPED_TEST(TestStringKernels, AsciiLower) { "[\"aaazzæÆ&\", null, \"\", \"bbb\"]"); } -TYPED_TEST(TestStringKernels, StringReverse) { - this->CheckUnary("string_reverse", "[]", this->type(), "[]"); - this->CheckUnary("string_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), +TYPED_TEST(TestStringKernels, AsciiReverse) { + this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); + this->CheckUnary("ascii_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), "[\"dcba\", null, \"\", \"bbb\"]"); -// this->CheckUnary("string_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(), -// "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]"); + + Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]"); + FunctionOptions options{}; + ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}, &options)); +} + +TYPED_TEST(TestStringKernels, Utf8Reverse) { + this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); + this->CheckUnary("utf8_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), + "[\"dcba\", null, \"\", \"bbb\"]"); + this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(), + "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]"); } TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { From c84ac035d9a7dd6ec3ed1423f6369ec88af56769 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 13 May 2021 15:33:17 -0400 Subject: [PATCH 04/15] minor changes --- cpp/src/arrow/compute/kernels/scalar_string.cc | 4 ++-- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 2b5795a9cfe..750255e8adc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -289,7 +289,7 @@ struct AsciiReverse : StringTransform> { // todo move to a proper place /** - * UTF8 codeunit size can be determined by looking at the first 4 bits of BYTE1 + * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1 */ const std::array UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4}; @@ -303,7 +303,7 @@ struct Utf8Reverse : StringTransform> { uint8_t* output, offset_type* output_written) { offset_type i = 0; while (i < input_string_ncodeunits) { - uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; + uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits std::copy(input + i, input + (i + offset), output + (input_string_ncodeunits - i - offset)); i += offset; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 5a55a953071..7df755b3872 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -97,8 +97,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { "[\"dcba\", null, \"\", \"bbb\"]"); Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]"); - FunctionOptions options{}; - ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}, &options)); + ASSERT_NOT_OK(CallFunction("ascii_reverse", {input})); } TYPED_TEST(TestStringKernels, Utf8Reverse) { From 124730e77044f3297b9726cdcab6835c709e8c62 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 13 May 2021 17:32:55 -0400 Subject: [PATCH 05/15] making minor changes --- cpp/src/arrow/compute/kernels/scalar_string.cc | 17 ++++++++--------- .../arrow/compute/kernels/scalar_string_test.cc | 10 ++++++---- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 750255e8adc..b80f50dd5e0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -273,22 +273,21 @@ struct AsciiReverse : StringTransform> { bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, uint8_t* output, offset_type* output_written) { - uint8_t failure = 0; + uint8_t utf8_char_found = 0; for (offset_type i = 0; i < input_string_ncodeunits; i++) { - failure |= input[i] & 0x80; // if a utf8 char is found, report to failure + // if a utf8 char is found, report to utf8_char_found + utf8_char_found |= input[i] & 0x80; output[input_string_ncodeunits - i - 1] = input[i]; } + // todo: finalize if L278 check is required or not. If not required, + // simply use the following // std::reverse_copy(input, input + input_string_ncodeunits, output); - // todo: this is not needed for reverse. may be change StringTransform struct to - // handle 2 cases. 1. for same size string transform and 2. variable size string - // transform *output_written = input_string_ncodeunits; - return failure == 0; + return utf8_char_found == 0; } }; -// todo move to a proper place -/** +/* * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1 */ const std::array UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, @@ -303,7 +302,7 @@ struct Utf8Reverse : StringTransform> { uint8_t* output, offset_type* output_written) { offset_type i = 0; while (i < input_string_ncodeunits) { - uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits + uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits std::copy(input + i, input + (i + offset), output + (input_string_ncodeunits - i - offset)); i += offset; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 7df755b3872..a10549a49a7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -93,8 +93,8 @@ TYPED_TEST(TestStringKernels, AsciiLower) { TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", "[]", this->type(), "[]"); - this->CheckUnary("ascii_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), - "[\"dcba\", null, \"\", \"bbb\"]"); + this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), + R"(["dcba", null, "", "bbb"])"); Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]"); ASSERT_NOT_OK(CallFunction("ascii_reverse", {input})); @@ -102,10 +102,12 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { TYPED_TEST(TestStringKernels, Utf8Reverse) { this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); - this->CheckUnary("utf8_reverse", "[\"abcd\", null, \"\", \"bbb\"]", this->type(), - "[\"dcba\", null, \"\", \"bbb\"]"); + this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(), + R"(["dcba", null, "", "bbb"])"); this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(), "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]"); + this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(), + "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]"); } TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { From 5ca75944eb821e0a6208e5fca6b57675c1a851f5 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Fri, 14 May 2021 11:13:01 -0400 Subject: [PATCH 06/15] editing tests --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a10549a49a7..b803be577cc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -96,7 +96,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), R"(["dcba", null, "", "bbb"])"); - Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]"); + Datum input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); ASSERT_NOT_OK(CallFunction("ascii_reverse", {input})); } @@ -104,10 +104,10 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) { this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(), R"(["dcba", null, "", "bbb"])"); - this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(), - "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]"); - this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(), - "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]"); + this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb"])", this->type(), + R"(["&ÆæZzaAa", null, "", "bbb"])"; + this->CheckUnary("utf8_reverse", R"(["ɑɽⱤæÆ&", null, "", "bbb"])", this->type(), + R"(["&ÆæⱤɽɑ", null, "", "bbb"])"); } TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { From b8a4bb3adcb93676e7a74b97fc293bfffd4ffc10 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Fri, 14 May 2021 18:12:50 -0400 Subject: [PATCH 07/15] adding review changes --- cpp/src/arrow/compute/kernels/scalar_string.cc | 18 ++++++++++-------- .../compute/kernels/scalar_string_test.cc | 17 +++++++++++------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index b80f50dd5e0..f7e3e604b08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -279,19 +279,21 @@ struct AsciiReverse : StringTransform> { utf8_char_found |= input[i] & 0x80; output[input_string_ncodeunits - i - 1] = input[i]; } - // todo: finalize if L278 check is required or not. If not required, - // simply use the following - // std::reverse_copy(input, input + input_string_ncodeunits, output); *output_written = input_string_ncodeunits; return utf8_char_found == 0; } }; /* - * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1 + * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1 + * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length + * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length + * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars + * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars + * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars */ const std::array UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 2, 2, 3, 4}; + 1, 1, 1, 1, 2, 2, 3, 4}; template struct Utf8Reverse : StringTransform> { @@ -302,9 +304,9 @@ struct Utf8Reverse : StringTransform> { uint8_t* output, offset_type* output_written) { offset_type i = 0; while (i < input_string_ncodeunits) { - uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits - std::copy(input + i, input + (i + offset), - output + (input_string_ncodeunits - i - offset)); + uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; + offset_type stride = std::min(i + offset, input_string_ncodeunits); + std::copy(input + i, input + stride, output + input_string_ncodeunits - stride); i += offset; } *output_written = input_string_ncodeunits; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index b803be577cc..f11314193e8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -96,18 +96,23 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(), R"(["dcba", null, "", "bbb"])"); - Datum input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); - ASSERT_NOT_OK(CallFunction("ascii_reverse", {input})); + Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"), + CallFunction("ascii_reverse", {invalid_input})); } TYPED_TEST(TestStringKernels, Utf8Reverse) { this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(), R"(["dcba", null, "", "bbb"])"); - this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb"])", this->type(), - R"(["&ÆæZzaAa", null, "", "bbb"])"; - this->CheckUnary("utf8_reverse", R"(["ɑɽⱤæÆ&", null, "", "bbb"])", this->type(), - R"(["&ÆæⱤɽɑ", null, "", "bbb"])"); + this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", this->type(), + R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])"); + + // inputs with malformed utf8 chars would produce garbage output, but the end result + // would produce arrays with same lengths. Hence checking offset buffer equality + auto malformed_input = ArrayFromJSON(this->type(), "[\"ɑ\xFFɑa\", \"ɽ\xe1\xbdɽa\"]"); + const Result& res = CallFunction("utf8_reverse", {malformed_input}); + ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1])); } TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { From 55e39acdbb8471d0332a1d3cc0448758c6f63e02 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 17 May 2021 14:02:15 -0400 Subject: [PATCH 08/15] adding review changes --- .../arrow/compute/kernels/scalar_string.cc | 26 ++++++++++++++----- .../compute/kernels/scalar_string_test.cc | 3 ++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index f7e3e604b08..18be5b4f0af 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -142,6 +142,11 @@ struct StringTransform { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return Derived().Execute(ctx, batch, out); } + + static Status InvalidStatus() { + return Status::Invalid("Invalid UTF8 sequence in input"); + } + Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].kind() == Datum::ARRAY) { const ArrayData& input = *batch[0].array(); @@ -173,7 +178,7 @@ struct StringTransform { if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( input_string, input_string_ncodeunits, output_str + output_ncodeunits, &encoded_nbytes))) { - return Status::Invalid("Invalid UTF8 sequence in input"); + return Derived::InvalidStatus(); } output_ncodeunits += encoded_nbytes; output_string_offsets[i + 1] = output_ncodeunits; @@ -199,7 +204,7 @@ struct StringTransform { if (ARROW_PREDICT_FALSE(!static_cast(*this).Transform( input.value->data(), data_nbytes, value_buffer->mutable_data(), &encoded_nbytes))) { - return Status::Invalid("Invalid UTF8 sequence in input"); + return Derived::InvalidStatus(); } RETURN_NOT_OK(value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true)); } @@ -282,6 +287,8 @@ struct AsciiReverse : StringTransform> { *output_written = input_string_ncodeunits; return utf8_char_found == 0; } + + static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); } }; /* @@ -2353,7 +2360,7 @@ const FunctionDoc ascii_upper_doc( const FunctionDoc ascii_lower_doc( "Transform ASCII input to lowercase", ("For each string in `strings`, return a lowercase version.\n\n" - "This function assumes the input is fully ASCII. It it may contain\n" + "This function assumes the input is fully ASCII. If it may contain\n" "non-ASCII characters, use \"utf8_lower\" instead."), {"strings"}); @@ -2366,12 +2373,19 @@ const FunctionDoc utf8_lower_doc( ("For each string in `strings`, return a lowercase version."), {"strings"}); const FunctionDoc ascii_reverse_doc( - "Reverse ascii input", - ("For each ascii string in `strings`, return a reversed version."), {"strings"}); + "Reverse ASCII input", + ("For each ASCII string in `strings`, return a reversed version.\n\n" + "This function assumes the input is fully ASCII. If it may contain\n" + "non-ASCII characters, use \"utf8_reverse\" instead."), + {"strings"}); const FunctionDoc utf8_reverse_doc( "Reverse utf8 input", - ("For each utf8 string in `strings`, return a reversed version."), {"strings"}); + ("For each utf8 string in `strings`, return a reversed version.\n\n" + "This function works on UTF8 codepoint level. As a consequence, if the input \n" + "contains combining UTF8 character sequences/ 'graphemes', they would also \n" + "be reversed."), + {"strings"}); } // namespace diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index f11314193e8..31352a9c6d5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -97,7 +97,8 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { R"(["dcba", null, "", "bbb"])"); Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr("Invalid UTF8 sequence"), + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("Non-ascii sequence in input"), CallFunction("ascii_reverse", {invalid_input})); } From 24a60883964bcc4888aa0ebdfb27ad022503c6e2 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 17 May 2021 14:18:27 -0400 Subject: [PATCH 09/15] adding rst docs --- docs/source/cpp/compute.rst | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 592dc4ec1b0..889a0b96559 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -435,47 +435,59 @@ String transforms +==========================+============+=========================+=====================+=========+=======================================+ | ascii_lower | Unary | String-like | String-like | \(1) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ +| ascii_reverse | Unary | String-like | String-like | \(2) | | ++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ | ascii_upper | Unary | String-like | String-like | \(1) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(2) | | +| binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(3) | | ++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ +| replace_substring | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| replace_substring | Unary | String-like | String-like | \(3) | :struct:`ReplaceSubstringOptions` | +| replace_substring_regex | Unary | String-like | String-like | \(5) | :struct:`ReplaceSubstringOptions` | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| replace_substring_regex | Unary | String-like | String-like | \(4) | :struct:`ReplaceSubstringOptions` | +| utf8_length | Unary | String-like | Int32 or Int64 | \(6) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_length | Unary | String-like | Int32 or Int64 | \(5) | | +| utf8_lower | Unary | String-like | String-like | \(7) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_lower | Unary | String-like | String-like | \(6) | | +| utf8_reverse | Unary | String-like | String-like | \(8) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ -| utf8_upper | Unary | String-like | String-like | \(6) | | +| utf8_upper | Unary | String-like | String-like | \(7) | | +--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+ * \(1) Each ASCII character in the input is converted to lowercase or uppercase. Non-ASCII characters are left untouched. -* \(2) Output is the physical length in bytes of each input element. Output +* \(2) ASCII input is reversed to the output. If non-ASCII characters + are present, ``Invalid`` :class:`Status` will be returned. + +* \(3) Output is the physical length in bytes of each input element. Output type is Int32 for Binary / String, Int64 for LargeBinary / LargeString. -* \(3) Replace non-overlapping substrings that match to +* \(4) Replace non-overlapping substrings that match to :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. -* \(4) Replace non-overlapping substrings that match to the regular expression +* \(5) Replace non-overlapping substrings that match to the regular expression :member:`ReplaceSubstringOptions::pattern` by :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 library. If :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the maximum number of replacements made, counting from the left. Note that if the pattern contains groups, backreferencing can be used. -* \(5) Output is the number of characters (not bytes) of each input element. +* \(6) Output is the number of characters (not bytes) of each input element. Output type is Int32 for String, Int64 for LargeString. -* \(6) Each UTF8-encoded character in the input is converted to lowercase or +* \(7) Each UTF8-encoded character in the input is converted to lowercase or uppercase. +* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters + are present, the Output would be undefined (but the size of Output buffers would + be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), + they would also be reversed. + String trimming ~~~~~~~~~~~~~~~ From 2fd1cc38f636a2e46a7674df88ab0a62e563d414 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 17 May 2021 15:10:36 -0400 Subject: [PATCH 10/15] adding review comments --- cpp/src/arrow/compute/kernels/scalar_string.cc | 8 ++++---- docs/source/cpp/compute.rst | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 18be5b4f0af..b4de24bfbda 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -288,7 +288,7 @@ struct AsciiReverse : StringTransform> { return utf8_char_found == 0; } - static Status InvalidStatus() { return Status::Invalid("Non-ascii sequence in input"); } + static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); } }; /* @@ -2382,9 +2382,9 @@ const FunctionDoc ascii_reverse_doc( const FunctionDoc utf8_reverse_doc( "Reverse utf8 input", ("For each utf8 string in `strings`, return a reversed version.\n\n" - "This function works on UTF8 codepoint level. As a consequence, if the input \n" - "contains combining UTF8 character sequences/ 'graphemes', they would also \n" - "be reversed."), + "This function operates on codepoints/UTF-8 code units, not grapheme\n" + "clusters. Hence, it will not correctly reverse grapheme clusters\n" + "composed of multiple codepoints."), {"strings"}); } // namespace diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 889a0b96559..1cee7bcf266 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -483,10 +483,9 @@ String transforms * \(7) Each UTF8-encoded character in the input is converted to lowercase or uppercase. -* \(8) UTF8-encoded input is reversed to the output. If malformed-UTF8 characters - are present, the Output would be undefined (but the size of Output buffers would - be preserved). If combined UTF8 characters are available (ex: graphems, glyphs), - they would also be reversed. +* \(8) Each UTF8-encoded code unit is written in reverse order to the output. + If the input is not valid UTF8, then the output is undefined (but the size of output + buffers will be preserved). String trimming From 1af4985fe797fe983614152ab97f95299c2a111d Mon Sep 17 00:00:00 2001 From: niranda perera Date: Mon, 17 May 2021 16:07:41 -0400 Subject: [PATCH 11/15] adding python docs rst --- docs/source/python/api/compute.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index da16ccdfa29..d74732933e7 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -144,8 +144,10 @@ String Transforms :toctree: ../generated/ ascii_lower + ascii_reverse ascii_upper utf8_lower + utf8_reverse utf8_upper Containment tests From 0bbefa8b8f7c942bda5a51c44cba036a1e11701d Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 18 May 2021 10:57:53 -0400 Subject: [PATCH 12/15] Update cpp/src/arrow/compute/kernels/scalar_string_test.cc Co-authored-by: David Li --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 31352a9c6d5..42ad1298ddb 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -98,7 +98,7 @@ TYPED_TEST(TestStringKernels, AsciiReverse) { Datum invalid_input = ArrayFromJSON(this->type(), R"(["aAazZæÆ&", null, "", "bbb"])"); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - testing::HasSubstr("Non-ascii sequence in input"), + testing::HasSubstr("Non-ASCII sequence in input"), CallFunction("ascii_reverse", {invalid_input})); } From 2c1861910518a749a69ab31ebf4b95f3bdd2fce7 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Tue, 18 May 2021 23:57:57 -0400 Subject: [PATCH 13/15] fixing lint problems --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 42ad1298ddb..2deac9bc5c4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -106,8 +106,8 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) { this->CheckUnary("utf8_reverse", "[]", this->type(), "[]"); this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(), R"(["dcba", null, "", "bbb"])"); - this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", this->type(), - R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])"); + this->CheckUnary("utf8_reverse", R"(["aAazZæÆ&", null, "", "bbb", "ɑɽⱤæÆ"])", + this->type(), R"(["&ÆæZzaAa", null, "", "bbb", "ÆæⱤɽɑ"])"); // inputs with malformed utf8 chars would produce garbage output, but the end result // would produce arrays with same lengths. Hence checking offset buffer equality From 6b27e5422027d4acdb7c0a46405ae1d08ab70128 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Thu, 20 May 2021 11:01:18 -0400 Subject: [PATCH 14/15] moving ValidUtf8CodepointByteSize to util/utf8.h --- cpp/src/arrow/compute/kernels/scalar_string.cc | 12 +----------- cpp/src/arrow/util/utf8.cc | 2 ++ cpp/src/arrow/util/utf8.h | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index b4de24bfbda..dbbfcdbca04 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -291,16 +291,6 @@ struct AsciiReverse : StringTransform> { static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); } }; -/* - * size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1 - * UTF8_BYTE_SIZE_LUT[0..7] --> pure ascii chars --> 1B length - * UTF8_BYTE_SIZE_LUT[8..11] --> internal bytes --> 1B length - * UTF8_BYTE_SIZE_LUT[12,13] --> 2B long UTF8 chars - * UTF8_BYTE_SIZE_LUT[14] --> 3B long UTF8 chars - * UTF8_BYTE_SIZE_LUT[15] --> 4B long UTF8 chars - */ -const std::array UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 2, 2, 3, 4}; template struct Utf8Reverse : StringTransform> { @@ -311,7 +301,7 @@ struct Utf8Reverse : StringTransform> { uint8_t* output, offset_type* output_written) { offset_type i = 0; while (i < input_string_ncodeunits) { - uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; + uint8_t offset = util::ValidUtf8CodepointByteSize(input + i); offset_type stride = std::min(i + offset, input_string_ncodeunits); std::copy(input + i, input + stride, output + input_string_ncodeunits - stride); i += offset; diff --git a/cpp/src/arrow/util/utf8.cc b/cpp/src/arrow/util/utf8.cc index 478d8ade95f..11394d2e64c 100644 --- a/cpp/src/arrow/util/utf8.cc +++ b/cpp/src/arrow/util/utf8.cc @@ -64,6 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT uint16_t utf8_large_table[9 * 256] = {0xffff}; +const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + static void InitializeLargeTable() { for (uint32_t state = 0; state < 9; ++state) { for (uint32_t byte = 0; byte < 256; ++byte) { diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h index 2d94ca4986e..310d6913403 100644 --- a/cpp/src/arrow/util/utf8.h +++ b/cpp/src/arrow/util/utf8.h @@ -65,6 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12; // In this table states are multiples of 256. ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256]; +ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16]; + // Success / reject states when looked up in the large table static constexpr uint16_t kUTF8ValidateAccept = 0; static constexpr uint16_t kUTF8ValidateReject = 256; @@ -293,6 +295,18 @@ Result SkipUTF8BOM(const uint8_t* data, int64_t size); static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000; +// size of a valid UTF8 can be determined by looking at leading 4 bits of BYTE1 +// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length +// utf8_byte_size_table[8..11] --> internal bytes --> 1B length +// utf8_byte_size_table[12,13] --> 2B long UTF8 chars +// utf8_byte_size_table[14] --> 3B long UTF8 chars +// utf8_byte_size_table[15] --> 4B long UTF8 chars +// NOTE: Results for invalid/ malformed utf-8 sequences are undefined. +// ex: \xFF... returns 4B +static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) { + return internal::utf8_byte_size_table[*codeunit >> 4]; +} + static inline bool Utf8IsContinuation(const uint8_t codeunit) { return (codeunit & 0xC0) == 0x80; // upper two bits should be 10 } From 7311300db96aa385f3b5ae5766f57549fccf23a6 Mon Sep 17 00:00:00 2001 From: niranda perera Date: Fri, 21 May 2021 00:21:32 -0400 Subject: [PATCH 15/15] fixing lint issues --- cpp/src/arrow/compute/kernels/scalar_string.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index dbbfcdbca04..00b1fcca9e2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -291,7 +291,6 @@ struct AsciiReverse : StringTransform> { static Status InvalidStatus() { return Status::Invalid("Non-ASCII sequence in input"); } }; - template struct Utf8Reverse : StringTransform> { using Base = StringTransform>;