From bbb32e7d756dbc3c0a994b81f9620af00c511be5 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 23 Mar 2021 15:31:27 -0400 Subject: [PATCH 1/8] Add utf8_length documentation --- docs/source/cpp/compute.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 8b4f2219989..e4eaa94bc59 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -435,9 +435,11 @@ String transforms +--------------------------+------------+-------------------------+---------------------+---------+ | binary_length | Unary | Binary- or String-like | Int32 or Int64 | \(2) | +--------------------------+------------+-------------------------+---------------------+---------+ -| utf8_lower | Unary | String-like | String-like | \(3) | +| utf8_length | Unary | String-like | Int32 or Int64 | \(3) | +--------------------------+------------+-------------------------+---------------------+---------+ -| utf8_upper | Unary | String-like | String-like | \(3) | +| utf8_lower | Unary | String-like | String-like | \(4) | ++--------------------------+------------+-------------------------+---------------------+---------+ +| utf8_upper | Unary | String-like | String-like | \(4) | +--------------------------+------------+-------------------------+---------------------+---------+ @@ -447,7 +449,10 @@ String transforms * \(2) Output is the physical length in bytes of each input element. Output type is Int32 for Binary / String, Int64 for LargeBinary / LargeString. -* \(3) Each UTF8-encoded character in the input is converted to lowercase or +* \(3) Output is the number of characters (not bytes) of each input element. + Output type is Int32 for String, Int64 for LargeString. + +* \(4) Each UTF8-encoded character in the input is converted to lowercase or uppercase. From 8630d248b1bbf951f36490ec0222e54f8650cb33 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 23 Mar 2021 15:39:01 -0400 Subject: [PATCH 2/8] add a UTF8 case to binary_length, add an equivalent test for utf8_length --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index d72c631bdcd..8b5931e6a22 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -61,8 +61,8 @@ class TestBinaryKernels : public BaseTestStringKernels {}; TYPED_TEST_SUITE(TestBinaryKernels, BinaryTypes); TYPED_TEST(TestBinaryKernels, BinaryLength) { - this->CheckUnary("binary_length", R"(["aaa", null, "", "b"])", this->offset_type(), - "[3, null, 0, 1]"); + this->CheckUnary("binary_length", R"(["aaa", null, "áéíóú", "", "b"])", + this->offset_type(), "[3, null, 10, 0, 1]"); } template @@ -103,6 +103,11 @@ TEST(TestStringKernels, LARGE_MEMORY_TEST(Utf8Upper32bitGrowth)) { #ifdef ARROW_WITH_UTF8PROC +TYPED_TEST(TestStringKernels, Utf8Length) { + this->CheckUnary("utf8_length", R"(["aaa", null, "áéíóú", "", "b"])", + this->offset_type(), "[3, null, 5, 0, 1]"); +} + TYPED_TEST(TestStringKernels, Utf8Upper) { this->CheckUnary("utf8_upper", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(), "[\"AAAZZÆÆ&\", null, \"\", \"B\"]"); From 05d6bde3d55bc6a16e8bedc0e666a6d6cc5efb18 Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 23 Mar 2021 15:41:37 -0400 Subject: [PATCH 3/8] add a test ensuring "\0" is invalid utf8 --- cpp/src/arrow/array/array_binary_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 5c247a6dc66..55535c81cb0 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -301,14 +301,18 @@ class TestStringArray : public ::testing::Test { auto st2 = ValidateFull(1, {0, 4}, "\xf4\x90\x80\x80"); // Single UTF8 character straddles two entries auto st3 = ValidateFull(2, {0, 1, 2}, "\xc3\xa9"); + // Null characters in the string + auto st4 = ValidateFull(1, {0, 4}, "\0\0\0\0"); if (T::is_utf8) { ASSERT_RAISES(Invalid, st1); ASSERT_RAISES(Invalid, st2); ASSERT_RAISES(Invalid, st3); + ASSERT_RAISES(Invalid, st4); } else { ASSERT_OK(st1); ASSERT_OK(st2); ASSERT_OK(st3); + ASSERT_OK(st4); } } From 6379ff41847b3b2ac4bd3b51d19c253ffb6a982e Mon Sep 17 00:00:00 2001 From: Eduardo Ponce Date: Tue, 23 Mar 2021 18:32:06 -0400 Subject: [PATCH 4/8] add utf8_length kernels for STRING and LARGE_STRING --- .../arrow/compute/kernels/scalar_string.cc | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 2eeac71c727..966d72cfb75 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -66,6 +66,25 @@ struct BinaryLength { #ifdef ARROW_WITH_UTF8PROC +struct Utf8Length { + template + static OutValue Call(KernelContext*, Arg0Value val) { + auto str = reinterpret_cast(val.data()); + auto strlen = static_cast(val.size()); + utf8proc_int32_t codepoint; + + OutValue length = 0; + while (strlen > 0) { + auto char_width = utf8proc_iterate(str, strlen, &codepoint); + // XXX check for errmsg? + str += char_width; + strlen -= char_width; + ++length; + } + return length; + } +}; + // Direct lookup tables for unicode properties constexpr uint32_t kMaxCodepointLookup = 0xffff; // up to this codepoint is in a lookup table @@ -1569,9 +1588,16 @@ const FunctionDoc strptime_doc( const FunctionDoc binary_length_doc( "Compute string lengths", - ("For each string in `strings`, emit its length. Null values emit null."), + ("For each string in `strings`, emit the number of bytes. Null values emit null."), {"strings"}); +#ifdef ARROW_WITH_UTF8PROC +const FunctionDoc utf8_length_doc("Compute utf8 string lengths", + ("For each string in `strings`, emit the number of " + "utf8 characters. Null values emit null."), + {"strings"}); +#endif // ARROW_WITH_UTF8PROC + void AddStrptime(FunctionRegistry* registry) { auto func = std::make_shared("strptime", Arity::Unary(), &strptime_doc); DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve), @@ -1597,6 +1623,23 @@ void AddBinaryLength(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +#ifdef ARROW_WITH_UTF8PROC +void AddUtf8Length(FunctionRegistry* registry) { + auto func = + std::make_shared("utf8_length", Arity::Unary(), &utf8_length_doc); + + ArrayKernelExec exec_offset_32 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32))); + + ArrayKernelExec exec_offset_64 = + applicator::ScalarUnaryNotNull::Exec; + DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64))); + + DCHECK_OK(registry->AddFunction(std::move(func))); +} +#endif // ARROW_WITH_UTF8PROC + template