From aecaff643ea144f656861fff900ed0f568896382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Fri, 23 Apr 2021 23:26:21 -0300 Subject: [PATCH 01/13] Add base implementation and tests for LPAD function considering string input values --- cpp/src/gandiva/precompiled/string_ops.cc | 67 +++++++++++++++++++ .../gandiva/precompiled/string_ops_test.cc | 43 ++++++++++++ cpp/src/gandiva/precompiled/types.h | 4 ++ 3 files changed, 114 insertions(+) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index fa9164bd139..ab5e4acc05d 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1422,6 +1422,73 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, out_len); } +FORCE_INLINE +const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, + gdv_int32* out_len) { + // if the text length or the defined return length (number of characters to return) + // is <=0, then return an empty string. + if (text_len == 0 || return_length <= 0) { + *out_len = 0; + return ""; + } + + // initially counts the number of utf8 characters in the defined text and fill_text + int32_t text_char_count = utf8_length(context, text, text_len); + int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len); + // text_char_count is zero if input has invalid utf8 char + // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char + if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) { + *out_len = 0; + return ""; + } + + if (return_length == text_char_count || + (return_length > text_char_count && fill_text_len == 0)) { + // case where the return length is same as the text's length, or if it need to + // fill into text but "fill_text" is empty, then return text directly. + *out_len = text_len; + return text; + } else if (return_length < text_char_count) { + // case where it truncates the result on return length. + *out_len = utf8_byte_pos(context, text, text_len, return_length); + return text; + } else { + // case (return_length > text_char_count) + // case where it needs to copy "fill_text" on the string left. The total number + // of chars to copy is given by (return_length - text_char_count) + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc(context, return_length)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + // try to fulfill the return string with the "fill_text" continuously + int32_t copied_chars_count = 0; + int32_t copied_chars_position = 0; + while (copied_chars_count < return_length - text_char_count) { + int32_t char_len; + int32_t fill_index; + // for each char, evaluate its length to consider it when mem copying + for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) { + if (copied_chars_count >= return_length - text_char_count) { + break; + } + char_len = utf8_char_length(fill_text[fill_index]); + copied_chars_count++; + } + memcpy(ret + copied_chars_position, fill_text, fill_index); + copied_chars_position += fill_index; + } + // after fulfilling the text, copy the main string + memcpy(ret + copied_chars_position, text, text_len); + *out_len = copied_chars_position + text_len; + return ret; + } +} + FORCE_INLINE const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len, const char* delimiter, gdv_int32 delim_len, gdv_int32 index, diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 9326aac1e0f..258efc6fd1d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -696,6 +696,49 @@ TEST(TestStringOps, TestLtrim) { EXPECT_FALSE(ctx.has_error()); } +TEST(TestStringOps, TestLpadString) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + out_str = lpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test"); + + out_str = lpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = lpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = lpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString"); + + out_str = lpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "FillFTestString"); + + out_str = lpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString"); + + out_str = lpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "ддабвгд"); + + out_str = lpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд"); + + out_str = lpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "дhello"); +} + TEST(TestStringOps, TestRtrim) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 1b0f96e0ab7..3d08417c692 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -407,6 +407,10 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len, const char* str, gdv_int32 str_len, gdv_int32 start_pos); +const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, + gdv_int32* out_len); + const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, From 2c929a98aacd043df98fe7c85c9b7a8d3a87a07a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Fri, 23 Apr 2021 23:35:20 -0300 Subject: [PATCH 02/13] Add function registry for LPAD string function --- cpp/src/gandiva/function_registry_string.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index d1f97cdb3e8..aff7fd7f71c 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -134,6 +134,9 @@ std::vector GetStringFunctionRegistry() { utf8(), kResultNullIfNull, "substr_utf8_int64", NativeFunction::kNeedsContext), + NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), + kResultNullIfNull, "lpad", NativeFunction::kNeedsContext), + NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), From 73927fc617a0d12dcd42782b2b808cdc0e53985a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Fri, 23 Apr 2021 23:35:36 -0300 Subject: [PATCH 03/13] Add projector test for LPAD string function --- cpp/src/gandiva/tests/projector_test.cc | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index b63af40d359..71eaf1fef55 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -1010,4 +1010,47 @@ TEST_F(TestProjector, TestIfElseOpt) { EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); } +TEST_F(TestProjector, TestLpad) { + // schema for input fields + auto field0 = field("f0", arrow::utf8()); + auto field1 = field("f1", arrow::int32()); + auto field2 = field("f2", arrow::utf8()); + auto schema = arrow::schema({field0, field1, field2}); + + // output fields + auto field_lpad = field("lpad", arrow::utf8()); + + // Build expression + auto lpad_expr = + TreeExprBuilder::MakeExpression("lpad", {field0, field1, field2}, field_lpad); + + std::shared_ptr projector; + auto status = Projector::Make(schema, {lpad_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 7; + auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""}, + {true, true, true, true, true, true, true}); + auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10}, + {true, true, true, true, true, true, true}); + auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""}, + {true, true, true, true, true, true, true}); + // expected output + auto exp_lpad = MakeArrowArrayUtf8({"a", "zzzza", "cab", "validinvalid", "", "in", ""}, + {true, true, true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, + {array0, array1, array2}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp_lpad, outputs.at(0)); +} + } // namespace gandiva From 585cad384a9f4eeb4b1cea9e22556551584a797a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Fri, 23 Apr 2021 23:57:29 -0300 Subject: [PATCH 04/13] Add base implementation and tests for LPAD function without pad texts considering string input values --- cpp/src/gandiva/precompiled/string_ops.cc | 52 +++++++++++++++++++ .../gandiva/precompiled/string_ops_test.cc | 26 ++++++++++ cpp/src/gandiva/precompiled/types.h | 3 ++ 3 files changed, 81 insertions(+) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index ab5e4acc05d..ecf9864c287 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1489,6 +1489,58 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, } } +FORCE_INLINE +const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len) { + // if the text length or the defined return length (number of characters to return) + // is <=0, then return an empty string. + if (text_len == 0 || return_length <= 0) { + *out_len = 0; + return ""; + } + + // initially counts the number of utf8 characters in the defined text and fill_text + int32_t text_char_count = utf8_length(context, text, text_len); + // text_char_count is zero if input has invalid utf8 char + // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char + if (text_char_count == 0) { + *out_len = 0; + return ""; + } + + if (return_length == text_char_count) { + // case where the return length is same as the text's length, or if it need to + // fill into text but "fill_text" is empty, then return text directly. + *out_len = text_len; + return text; + } else if (return_length < text_char_count) { + // case where it truncates the result on return length. + *out_len = utf8_byte_pos(context, text, text_len, return_length); + return text; + } else { + // case (return_length > text_char_count) + // case where it needs to copy "fill_text" on the string left. The total number + // of chars to copy is given by (return_length - text_char_count) + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc( + context, + text_len + (return_length - text_char_count))); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + const char* blank_space = " "; + for (int i = 0; i < return_length - text_char_count; ++i) { + ret[i] = blank_space[0]; + } + memcpy(ret + return_length - text_char_count, text, text_len); + *out_len = text_len + (return_length - text_char_count); + return ret; + } +} + FORCE_INLINE const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len, const char* delimiter, gdv_int32 delim_len, gdv_int32 index, diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 258efc6fd1d..11d1848066c 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -702,6 +702,7 @@ TEST(TestStringOps, TestLpadString) { gdv_int32 out_len = 0; const char* out_str; + // LPAD function tests - with defined fill pad text out_str = lpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test"); @@ -737,6 +738,31 @@ TEST(TestStringOps, TestLpadString) { out_str = lpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); EXPECT_EQ(std::string(out_str, out_len), "дhello"); + + // LPAD function tests - with NO pad text + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test"); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 0,&out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 18, &out_len); + EXPECT_EQ(std::string(out_str, out_len), " TestString"); + + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 15, &out_len); + EXPECT_EQ(std::string(out_str, out_len), " TestString"); + + out_str = lpad_no_fill_text(ctx_ptr, "абвгд", 10, 7, &out_len); + EXPECT_EQ(std::string(out_str, out_len), " абвгд"); } TEST(TestStringOps, TestRtrim) { diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 3d08417c692..e93301884a5 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -411,6 +411,9 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, gdv_int32* out_len); +const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len); + const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, From 08d2053238540a56eed6e6a97e82dcaafffe81c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Fri, 23 Apr 2021 23:59:24 -0300 Subject: [PATCH 05/13] Add function registry for LPAD string function without pad text --- cpp/src/gandiva/function_registry_string.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index aff7fd7f71c..7fad2154457 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -137,6 +137,9 @@ std::vector GetStringFunctionRegistry() { NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), kResultNullIfNull, "lpad", NativeFunction::kNeedsContext), + NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(), + kResultNullIfNull, "lpad_no_fill_text", NativeFunction::kNeedsContext), + NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), From c270fb1ec4da917a3dee7e245bcdb8e3aef5167a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Sun, 25 Apr 2021 11:01:41 -0300 Subject: [PATCH 06/13] Add base implementation and tests for RPAD functions --- cpp/src/gandiva/precompiled/string_ops.cc | 119 ++++++++++++++++++ .../gandiva/precompiled/string_ops_test.cc | 69 ++++++++++ cpp/src/gandiva/precompiled/types.h | 7 ++ 3 files changed, 195 insertions(+) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index ecf9864c287..062bab912c9 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1489,6 +1489,73 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, } } +FORCE_INLINE +const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, + gdv_int32* out_len) { + // if the text length or the defined return length (number of characters to return) + // is <=0, then return an empty string. + if (text_len == 0 || return_length <= 0) { + *out_len = 0; + return ""; + } + + // initially counts the number of utf8 characters in the defined text and fill_text + int32_t text_char_count = utf8_length(context, text, text_len); + int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len); + // text_char_count is zero if input has invalid utf8 char + // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char + if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) { + *out_len = 0; + return ""; + } + + if (return_length == text_char_count || + (return_length > text_char_count && fill_text_len == 0)) { + // case where the return length is same as the text's length, or if it need to + // fill into text but "fill_text" is empty, then return text directly. + *out_len = text_len; + return text; + } else if (return_length < text_char_count) { + // case where it truncates the result on return length. + *out_len = utf8_byte_pos(context, text, text_len, return_length); + return text; + } else { + // case (return_length > text_char_count) + // case where it needs to copy "fill_text" on the string right + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc(context, return_length)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + // fulfill the initial text copying the main input string + memcpy(ret, text, text_len); + // try to fulfill the return string with the "fill_text" continuously + int32_t copied_chars_count = 0; + int32_t copied_chars_position = 0; + while (text_char_count + copied_chars_count < return_length) { + int32_t char_len; + int32_t fill_length; + // for each char, evaluate its length to consider it when mem copying + for (fill_length = 0; fill_length < fill_text_len; fill_length += char_len) { + if (text_char_count + copied_chars_count >= return_length) { + break; + } + char_len = utf8_char_length(fill_text[fill_length]); + copied_chars_count++; + } + memcpy(ret + text_len + copied_chars_position, fill_text, fill_length); + copied_chars_position += fill_length; + } + *out_len = copied_chars_position + text_len; + return ret; + } +} + + FORCE_INLINE const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, gdv_int32* out_len) { @@ -1541,6 +1608,58 @@ const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 tex } } +FORCE_INLINE +const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len) { + // if the text length or the defined return length (number of characters to return) + // is <=0, then return an empty string. + if (text_len == 0 || return_length <= 0) { + *out_len = 0; + return ""; + } + + // initially counts the number of utf8 characters in the defined text and fill_text + int32_t text_char_count = utf8_length(context, text, text_len); + // text_char_count is zero if input has invalid utf8 char + // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char + if (text_char_count == 0) { + *out_len = 0; + return ""; + } + + if (return_length == text_char_count) { + // case where the return length is same as the text's length, or if it need to + // fill into text but "fill_text" is empty, then return text directly. + *out_len = text_len; + return text; + } else if (return_length < text_char_count) { + // case where it truncates the result on return length. + *out_len = utf8_byte_pos(context, text, text_len, return_length); + return text; + } else { + // case (return_length > text_char_count) + // case where it needs to copy "fill_text" on the string right + char* ret = + reinterpret_cast(gdv_fn_context_arena_malloc( + context, + text_len + (return_length - text_char_count))); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + // fulfill the initial text copying the main string input + memcpy(ret, text, text_len); + const char* blank_space = " "; + for (int i = 0; i < return_length - text_char_count; ++i) { + ret[text_len + i] = blank_space[0]; + } + *out_len = text_len + (return_length - text_char_count); + return ret; + } +} + FORCE_INLINE const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len, const char* delimiter, gdv_int32 delim_len, gdv_int32 index, diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 11d1848066c..1cac876e161 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -765,6 +765,75 @@ TEST(TestStringOps, TestLpadString) { EXPECT_EQ(std::string(out_str, out_len), " абвгд"); } +TEST(TestStringOps, TestRpadString) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + // RPAD function tests - with defined fill pad text + out_str = rpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test"); + + out_str = rpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = rpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = rpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill"); + + out_str = rpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF"); + + out_str = rpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi"); + + out_str = rpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "абвгддд"); + + out_str = rpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд"); + + out_str = rpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "helloд"); + + // RPAD function tests - with NO pad text + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 4, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "Test"); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString"); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 0,&out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); + EXPECT_EQ(std::string(out_str, out_len), ""); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 18, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString "); + + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 15, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "TestString "); + + out_str = rpad_no_fill_text(ctx_ptr, "абвгд", 10, 7, &out_len); + EXPECT_EQ(std::string(out_str, out_len), "абвгд "); +} + TEST(TestStringOps, TestRtrim) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx); diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index e93301884a5..d5b39b548cf 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -411,9 +411,16 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, gdv_int32* out_len); +const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, + gdv_int32* out_len); + const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, gdv_int32* out_len); +const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len); + const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, gdv_int32 from_str_len, From dc72148d6d96e9798efdda3374ca242390007976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Sun, 25 Apr 2021 11:03:29 -0300 Subject: [PATCH 07/13] Add function registry for RPAD string function without pad text --- cpp/src/gandiva/function_registry_string.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 7fad2154457..3fd75f4f3b8 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -140,6 +140,12 @@ std::vector GetStringFunctionRegistry() { NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(), kResultNullIfNull, "lpad_no_fill_text", NativeFunction::kNeedsContext), + NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), + kResultNullIfNull, "rpad", NativeFunction::kNeedsContext), + + NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(), + kResultNullIfNull, "rpad_no_fill_text", NativeFunction::kNeedsContext), + NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), From b6b63e9d3fdb945c639fa608f355671bfe425ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Sun, 25 Apr 2021 11:07:01 -0300 Subject: [PATCH 08/13] Add projector test for RPAD string function --- cpp/src/gandiva/tests/projector_test.cc | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 71eaf1fef55..1c8719999c7 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -1053,4 +1053,47 @@ TEST_F(TestProjector, TestLpad) { EXPECT_ARROW_ARRAY_EQUALS(exp_lpad, outputs.at(0)); } +TEST_F(TestProjector, TestRpad) { + // schema for input fields + auto field0 = field("f0", arrow::utf8()); + auto field1 = field("f1", arrow::int32()); + auto field2 = field("f2", arrow::utf8()); + auto schema = arrow::schema({field0, field1, field2}); + + // output fields + auto field_rpad = field("rpad", arrow::utf8()); + + // Build expression + auto rpad_expr = + TreeExprBuilder::MakeExpression("rpad", {field0, field1, field2}, field_rpad); + + std::shared_ptr projector; + auto status = Projector::Make(schema, {rpad_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 7; + auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""}, + {true, true, true, true, true, true, true}); + auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10}, + {true, true, true, true, true, true, true}); + auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""}, + {true, true, true, true, true, true, true}); + // expected output + auto exp_rpad = MakeArrowArrayUtf8({"a", "azzzz", "abc", "invalidvalid", "", "in", ""}, + {true, true, true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, + {array0, array1, array2}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp_rpad, outputs.at(0)); +} + } // namespace gandiva From 66594a0af7cdafa55b59302a4f17e8c5b95d8344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Mon, 26 Apr 2021 19:08:53 -0300 Subject: [PATCH 09/13] Correct lint local errors on gandiva --- cpp/src/gandiva/function_registry_string.cc | 6 ++++-- cpp/src/gandiva/precompiled/string_ops_test.cc | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 3fd75f4f3b8..c5739270056 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -138,13 +138,15 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "lpad", NativeFunction::kNeedsContext), NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "lpad_no_fill_text", NativeFunction::kNeedsContext), + kResultNullIfNull, "lpad_no_fill_text", + NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), kResultNullIfNull, "rpad", NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "rpad_no_fill_text", NativeFunction::kNeedsContext), + kResultNullIfNull, "rpad_no_fill_text", + NativeFunction::kNeedsContext), NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 1cac876e161..670cf798f76 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -749,7 +749,7 @@ TEST(TestStringOps, TestLpadString) { out_str = lpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 0,&out_len); + out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); @@ -818,7 +818,7 @@ TEST(TestStringOps, TestRpadString) { out_str = rpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 0,&out_len); + out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); From 26b90b09e5c52a9fd80f398c43819f685bd46231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Tue, 27 Apr 2021 11:08:34 -0300 Subject: [PATCH 10/13] Correct ci lint errors on gandiva --- cpp/src/gandiva/precompiled/string_ops.cc | 13 ++++--------- cpp/src/gandiva/tests/projector_test.cc | 14 ++++++-------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 062bab912c9..7f817c3308e 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1555,7 +1555,6 @@ const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, } } - FORCE_INLINE const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, gdv_int32* out_len) { @@ -1588,10 +1587,8 @@ const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 tex // case (return_length > text_char_count) // case where it needs to copy "fill_text" on the string left. The total number // of chars to copy is given by (return_length - text_char_count) - char* ret = - reinterpret_cast(gdv_fn_context_arena_malloc( - context, - text_len + (return_length - text_char_count))); + char* ret = reinterpret_cast(gdv_fn_context_arena_malloc( + context, text_len + (return_length - text_char_count))); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); @@ -1639,10 +1636,8 @@ const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 tex } else { // case (return_length > text_char_count) // case where it needs to copy "fill_text" on the string right - char* ret = - reinterpret_cast(gdv_fn_context_arena_malloc( - context, - text_len + (return_length - text_char_count))); + char* ret = reinterpret_cast(gdv_fn_context_arena_malloc( + context, text_len + (return_length - text_char_count))); if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 1c8719999c7..af1e6544115 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -1033,20 +1033,19 @@ TEST_F(TestProjector, TestLpad) { auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""}, {true, true, true, true, true, true, true}); auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10}, - {true, true, true, true, true, true, true}); + {true, true, true, true, true, true, true}); auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""}, {true, true, true, true, true, true, true}); // expected output auto exp_lpad = MakeArrowArrayUtf8({"a", "zzzza", "cab", "validinvalid", "", "in", ""}, - {true, true, true, true, true, true, true}); + {true, true, true, true, true, true, true}); // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, - {array0, array1, array2}); + auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2}); // Evaluate expression arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); + status = projector->Evaluate(*in, pool_, &outputs); EXPECT_TRUE(status.ok()) << status.message(); // Validate results @@ -1084,12 +1083,11 @@ TEST_F(TestProjector, TestRpad) { {true, true, true, true, true, true, true}); // prepare input record batch - auto in_batch = arrow::RecordBatch::Make(schema, num_records, - {array0, array1, array2}); + auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2}); // Evaluate expression arrow::ArrayVector outputs; - status = projector->Evaluate(*in_batch, pool_, &outputs); + status = projector->Evaluate(*in, pool_, &outputs); EXPECT_TRUE(status.ok()) << status.message(); // Validate results From 4c4b2f490f21a579b5f15a652887e91a4f5f2a5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Wed, 28 Apr 2021 15:36:33 -0300 Subject: [PATCH 11/13] Change lpad and rpad functions signature and definition --- cpp/src/gandiva/function_registry_string.cc | 10 +- cpp/src/gandiva/precompiled/string_ops.cc | 112 ++---------------- .../gandiva/precompiled/string_ops_test.cc | 80 ++++++------- cpp/src/gandiva/precompiled/types.h | 18 +-- 4 files changed, 67 insertions(+), 153 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index c5739270056..69e08133c15 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -135,17 +135,19 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsContext), NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), - kResultNullIfNull, "lpad", NativeFunction::kNeedsContext), + kResultNullIfNull, "lpad_utf8_int32_utf8", + NativeFunction::kNeedsContext), NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "lpad_no_fill_text", + kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), - kResultNullIfNull, "rpad", NativeFunction::kNeedsContext), + kResultNullIfNull, "rpad_utf8_int32_utf8", + NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "rpad_no_fill_text", + kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext), NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 7f817c3308e..6aa0c3b2175 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1423,9 +1423,9 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text, } FORCE_INLINE -const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, - gdv_int32* out_len) { +const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, + gdv_int32 fill_text_len, gdv_int32* out_len) { // if the text length or the defined return length (number of characters to return) // is <=0, then return an empty string. if (text_len == 0 || return_length <= 0) { @@ -1490,9 +1490,9 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, } FORCE_INLINE -const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, - gdv_int32* out_len) { +const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, + gdv_int32 fill_text_len, gdv_int32* out_len) { // if the text length or the defined return length (number of characters to return) // is <=0, then return an empty string. if (text_len == 0 || return_length <= 0) { @@ -1556,103 +1556,15 @@ const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, } FORCE_INLINE -const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, gdv_int32* out_len) { - // if the text length or the defined return length (number of characters to return) - // is <=0, then return an empty string. - if (text_len == 0 || return_length <= 0) { - *out_len = 0; - return ""; - } - - // initially counts the number of utf8 characters in the defined text and fill_text - int32_t text_char_count = utf8_length(context, text, text_len); - // text_char_count is zero if input has invalid utf8 char - // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char - if (text_char_count == 0) { - *out_len = 0; - return ""; - } - - if (return_length == text_char_count) { - // case where the return length is same as the text's length, or if it need to - // fill into text but "fill_text" is empty, then return text directly. - *out_len = text_len; - return text; - } else if (return_length < text_char_count) { - // case where it truncates the result on return length. - *out_len = utf8_byte_pos(context, text, text_len, return_length); - return text; - } else { - // case (return_length > text_char_count) - // case where it needs to copy "fill_text" on the string left. The total number - // of chars to copy is given by (return_length - text_char_count) - char* ret = reinterpret_cast(gdv_fn_context_arena_malloc( - context, text_len + (return_length - text_char_count))); - if (ret == nullptr) { - gdv_fn_context_set_error_msg(context, - "Could not allocate memory for output string"); - *out_len = 0; - return ""; - } - const char* blank_space = " "; - for (int i = 0; i < return_length - text_char_count; ++i) { - ret[i] = blank_space[0]; - } - memcpy(ret + return_length - text_char_count, text, text_len); - *out_len = text_len + (return_length - text_char_count); - return ret; - } +const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len) { + return lpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len); } FORCE_INLINE -const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, gdv_int32* out_len) { - // if the text length or the defined return length (number of characters to return) - // is <=0, then return an empty string. - if (text_len == 0 || return_length <= 0) { - *out_len = 0; - return ""; - } - - // initially counts the number of utf8 characters in the defined text and fill_text - int32_t text_char_count = utf8_length(context, text, text_len); - // text_char_count is zero if input has invalid utf8 char - // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char - if (text_char_count == 0) { - *out_len = 0; - return ""; - } - - if (return_length == text_char_count) { - // case where the return length is same as the text's length, or if it need to - // fill into text but "fill_text" is empty, then return text directly. - *out_len = text_len; - return text; - } else if (return_length < text_char_count) { - // case where it truncates the result on return length. - *out_len = utf8_byte_pos(context, text, text_len, return_length); - return text; - } else { - // case (return_length > text_char_count) - // case where it needs to copy "fill_text" on the string right - char* ret = reinterpret_cast(gdv_fn_context_arena_malloc( - context, text_len + (return_length - text_char_count))); - if (ret == nullptr) { - gdv_fn_context_set_error_msg(context, - "Could not allocate memory for output string"); - *out_len = 0; - return ""; - } - // fulfill the initial text copying the main string input - memcpy(ret, text, text_len); - const char* blank_space = " "; - for (int i = 0; i < return_length - text_char_count; ++i) { - ret[text_len + i] = blank_space[0]; - } - *out_len = text_len + (return_length - text_char_count); - return ret; - } +const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len) { + return rpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len); } FORCE_INLINE diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 670cf798f76..1a4aafa266c 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -703,65 +703,65 @@ TEST(TestStringOps, TestLpadString) { const char* out_str; // LPAD function tests - with defined fill pad text - out_str = lpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test"); - out_str = lpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = lpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = lpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString"); - out_str = lpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "FillFTestString"); - out_str = lpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString"); - out_str = lpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); EXPECT_EQ(std::string(out_str, out_len), "ддабвгд"); - out_str = lpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд"); - out_str = lpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); + out_str = lpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); EXPECT_EQ(std::string(out_str, out_len), "дhello"); // LPAD function tests - with NO pad text - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 4, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test"); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 10, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 0, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 18, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len); EXPECT_EQ(std::string(out_str, out_len), " TestString"); - out_str = lpad_no_fill_text(ctx_ptr, "TestString", 10, 15, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len); EXPECT_EQ(std::string(out_str, out_len), " TestString"); - out_str = lpad_no_fill_text(ctx_ptr, "абвгд", 10, 7, &out_len); + out_str = lpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len); EXPECT_EQ(std::string(out_str, out_len), " абвгд"); } @@ -772,65 +772,65 @@ TEST(TestStringOps, TestRpadString) { const char* out_str; // RPAD function tests - with defined fill pad text - out_str = rpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test"); - out_str = rpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = rpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = rpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill"); - out_str = rpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF"); - out_str = rpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi"); - out_str = rpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len); EXPECT_EQ(std::string(out_str, out_len), "абвгддд"); - out_str = rpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд"); - out_str = rpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); + out_str = rpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len); EXPECT_EQ(std::string(out_str, out_len), "helloд"); // RPAD function tests - with NO pad text - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 4, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len); EXPECT_EQ(std::string(out_str, out_len), "Test"); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 10, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString"); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 0, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len); EXPECT_EQ(std::string(out_str, out_len), ""); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 18, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString "); - out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 15, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len); EXPECT_EQ(std::string(out_str, out_len), "TestString "); - out_str = rpad_no_fill_text(ctx_ptr, "абвгд", 10, 7, &out_len); + out_str = rpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len); EXPECT_EQ(std::string(out_str, out_len), "абвгд "); } diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index d5b39b548cf..02e3f60630f 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -407,18 +407,18 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len, const char* str, gdv_int32 str_len, gdv_int32 start_pos); -const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, - gdv_int32* out_len); +const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, + gdv_int32 fill_text_len, gdv_int32* out_len); -const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len, - gdv_int32* out_len); +const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, const char* fill_text, + gdv_int32 fill_text_len, gdv_int32* out_len); -const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, gdv_int32* out_len); +const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len, + gdv_int32 return_length, gdv_int32* out_len); -const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len, +const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len, gdv_int32 return_length, gdv_int32* out_len); const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text, From 33a5a147832cf5e20c10562f80ed0c814a343038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Thu, 29 Apr 2021 11:54:26 -0300 Subject: [PATCH 12/13] Fix identation on function string registry --- cpp/src/gandiva/function_registry_string.cc | 6 ++---- cpp/src/gandiva/precompiled/types.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 69e08133c15..15d35278a41 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -139,16 +139,14 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsContext), NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "lpad_utf8_int32", - NativeFunction::kNeedsContext), + kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(), kResultNullIfNull, "rpad_utf8_int32_utf8", NativeFunction::kNeedsContext), NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(), - kResultNullIfNull, "rpad_utf8_int32", - NativeFunction::kNeedsContext), + kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext), NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 02e3f60630f..4ece685388d 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -419,7 +419,7 @@ const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_ gdv_int32 return_length, gdv_int32* out_len); const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len, - gdv_int32 return_length, gdv_int32* out_len); + gdv_int32 return_length, gdv_int32* out_len); const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text, gdv_int32 text_len, const char* from_str, From 4efc0fe8c3d7058d9d67b71c4ce10709fcf9f1cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro?= Date: Thu, 29 Apr 2021 14:15:27 -0300 Subject: [PATCH 13/13] Add utf8_length method that ignore invalid char considering size 1 --- cpp/src/gandiva/precompiled/string_ops.cc | 47 ++++++++++++++--------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 6aa0c3b2175..125a9d4c650 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -190,6 +190,27 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) { return count; } +// Count the number of utf8 characters, ignoring invalid char, considering size 1 +FORCE_INLINE +gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) { + int char_len = 0; + int count = 0; + for (int i = 0; i < data_len; i += char_len) { + char_len = utf8_char_length(data[i]); + if (char_len == 0 || i + char_len > data_len) { // invalid byte or incomplete glyph + // if invalid byte or incomplete glyph, ignore it + char_len = 1; + } + for (int j = 1; j < char_len; ++j) { + if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph + char_len += 1; + } + } + ++count; + } + return count; +} + // Get the byte position corresponding to a character position for a non-empty utf8 // sequence FORCE_INLINE @@ -1433,15 +1454,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 return ""; } - // initially counts the number of utf8 characters in the defined text and fill_text - int32_t text_char_count = utf8_length(context, text, text_len); - int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len); - // text_char_count is zero if input has invalid utf8 char - // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char - if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) { - *out_len = 0; - return ""; - } + // count the number of utf8 characters on text, ignoring invalid bytes + int text_char_count = utf8_length_ignore_invalid(text, text_len); if (return_length == text_char_count || (return_length > text_char_count && fill_text_len == 0)) { @@ -1477,6 +1491,8 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 break; } char_len = utf8_char_length(fill_text[fill_index]); + // ignore invalid char on the fill text, considering it as size 1 + if (char_len == 0) char_len += 1; copied_chars_count++; } memcpy(ret + copied_chars_position, fill_text, fill_index); @@ -1500,15 +1516,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 return ""; } - // initially counts the number of utf8 characters in the defined text and fill_text - int32_t text_char_count = utf8_length(context, text, text_len); - int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len); - // text_char_count is zero if input has invalid utf8 char - // fill_char_count is zero if fill_text_len is > 0 and its value has invalid utf8 char - if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) { - *out_len = 0; - return ""; - } + // count the number of utf8 characters on text, ignoring invalid bytes + int text_char_count = utf8_length_ignore_invalid(text, text_len); if (return_length == text_char_count || (return_length > text_char_count && fill_text_len == 0)) { @@ -1545,6 +1554,8 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 break; } char_len = utf8_char_length(fill_text[fill_length]); + // ignore invalid char on the fill text, considering it as size 1 + if (char_len == 0) char_len += 1; copied_chars_count++; } memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);