From 50fd0af9b4e89e0a2689b968bb38ffba350d437b Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Wed, 9 Feb 2022 13:18:15 -0300 Subject: [PATCH 1/8] First implementation function Find in Set --- cpp/src/gandiva/function_registry_string.cc | 4 ++ cpp/src/gandiva/precompiled/string_ops.cc | 47 +++++++++++++++++++ .../gandiva/precompiled/string_ops_test.cc | 16 +++++++ cpp/src/gandiva/precompiled/types.h | 3 ++ cpp/src/gandiva/tests/projector_test.cc | 39 +++++++++++++++ 5 files changed, 109 insertions(+) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2bc6936d77b..bb28bec8c14 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -510,6 +510,10 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_mask_last_n_utf8_int32", NativeFunction::kNeedsContext), + NativeFunction("find_in_set", {}, DataTypeVector{utf8(), utf8()}, int32(), + kResultNullIfNull, "find_in_set_utf8_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("instr", {}, DataTypeVector{utf8(), utf8()}, int32(), kResultNullIfNull, "instr_utf8"), diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index c255b9a11c0..4275d6494fb 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3034,4 +3034,51 @@ int32_t instr_utf8(const char* string, int32_t string_len, const char* substring } return 0; } + +FORCE_INLINE +int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_find_len, + const char* string_list, int32_t string_list_len) { + // Return 0 if entry len <= 0 + if (to_find_len <= 0 || string_list_len <= 0) { + gdv_fn_context_set_error_msg(context, "Invalid input values."); + return 0; + } + + // Return 0 if to search entry have commas + if (is_substr_utf8_utf8(to_find, to_find_len, reinterpret_cast(","), 1)) { + return 0; + } + + int32_t cur_pos_in_array = 0; + int32_t cur_length = 0; + bool matching = true; + + for (int i = 0; i < string_list_len; i++) { + if (string_list[i] == ',') { + cur_pos_in_array++; + if (matching && cur_length == to_find_len) { + return cur_pos_in_array; + } else { + matching = true; + cur_length = 0; + } + } else { + if (cur_length + 1 <= string_list_len) { + if (!matching || to_find[cur_length] != string_list[i]) { + matching = false; + } + } else { + matching = false; + } + cur_length++; + } + } + + if (matching && cur_length == to_find_len) { + cur_pos_in_array++; + return cur_pos_in_array; + } else { + return 0; + } +} } // extern "C" diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index b84c51b3a6b..bcaf527baf9 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -2702,4 +2702,20 @@ TEST(TestStringOps, TestInstr) { result = instr_utf8(s1.c_str(), s1_len, s2.c_str(), s2_len); EXPECT_EQ(result, 8); } + +TEST(TestStringOps, TestFindInSet) { + gandiva::ExecutionContext ctx; + auto ctx_ptr = reinterpret_cast(&ctx); + int32_t result; + result = find_in_set_utf8_utf8(ctx_ptr, "EE", 2, ",A,B,C,D,EE,F", 13); + EXPECT_EQ(result, 6); + result = find_in_set_utf8_utf8(ctx_ptr, "A", 1, "A,B,C,D,EE,F", 12); + EXPECT_EQ(result, 1); + result = find_in_set_utf8_utf8(ctx_ptr, "AAAB", 4, "A,B,C,D,EE,F", 12); + EXPECT_EQ(result, 0); + result = find_in_set_utf8_utf8(ctx_ptr, "E,E", 3, "A,B,C,D,EE,F", 12); + EXPECT_EQ(result, 0); + result = find_in_set_utf8_utf8(ctx_ptr, "C", 1, "A,B,,,,,,,C,,,,,", 16); + EXPECT_EQ(result, 9); +} } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index a0a83f18dd4..67220659f8c 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -829,4 +829,7 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8( int32_t instr_utf8(const char* string, int32_t string_len, const char* substring, int32_t substring_len); +int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_find_len, + const char* string_list, int32_t string_list_len); + } // extern "C" diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 65597b38f0b..e895c9dac1e 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -2824,6 +2824,45 @@ TEST_F(TestProjector, TestInstr) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); } +TEST_F(TestProjector, TestFindInSet) { + // schema for input fields + auto field0 = field("f0", arrow::utf8()); + auto field1 = field("f1", arrow::utf8()); + auto schema = arrow::schema({field0, field1}); + + // output fields + auto output_find_in_set = field("find_in_set_output", int32()); + + // Build expression + auto find_in_set_expr = TreeExprBuilder::MakeExpression("find_in_set", {field0, field1}, + output_find_in_set); + + std::shared_ptr projector; + auto status = + Projector::Make(schema, {find_in_set_expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()); + + // Create a row-batch with some sample data + int num_records = 4; + auto array0 = + MakeArrowArrayUtf8({"ABC", "...", "!C", "MORE"}, {true, true, true, true}); + auto array1 = MakeArrowArrayUtf8( + {"ZXL,KMY,DDD,ABC", "!!!,@@@,###,...,,,", ",A,,,,,,,,!C,,,,,", "MORE"}, + {true, true, true, true}); + // expected output + auto exp_sum = MakeArrowArrayInt32({4, 4, 10, 1}, {true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); +} TEST_F(TestProjector, TestNextDay) { // schema for input fields From b4c6692af135e4ced55708ce4cec11e17a07a6fd Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Fri, 11 Feb 2022 10:02:48 -0300 Subject: [PATCH 2/8] Added UTF8 Support --- cpp/src/gandiva/precompiled/string_ops.cc | 10 ++++++---- cpp/src/gandiva/tests/projector_test.cc | 16 +++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 4275d6494fb..239988ebd26 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3053,8 +3053,10 @@ int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_f int32_t cur_length = 0; bool matching = true; - for (int i = 0; i < string_list_len; i++) { - if (string_list[i] == ',') { + int char_length = 0; + for (int i = 0; i < string_list_len; i += char_length) { + char_length = utf8_char_length(string_list[i]); + if (char_length == 1 && string_list[i] == ',') { cur_pos_in_array++; if (matching && cur_length == to_find_len) { return cur_pos_in_array; @@ -3064,13 +3066,13 @@ int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_f } } else { if (cur_length + 1 <= string_list_len) { - if (!matching || to_find[cur_length] != string_list[i]) { + if (!matching || (memcmp(string_list + i, to_find + cur_length, char_length))) { matching = false; } } else { matching = false; } - cur_length++; + cur_length += utf8_char_length(to_find[cur_length]); } } diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index e895c9dac1e..8a1754fdff5 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -2843,14 +2843,16 @@ TEST_F(TestProjector, TestFindInSet) { EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data - int num_records = 4; - auto array0 = - MakeArrowArrayUtf8({"ABC", "...", "!C", "MORE"}, {true, true, true, true}); - auto array1 = MakeArrowArrayUtf8( - {"ZXL,KMY,DDD,ABC", "!!!,@@@,###,...,,,", ",A,,,,,,,,!C,,,,,", "MORE"}, - {true, true, true, true}); + int num_records = 7; + auto array0 = MakeArrowArrayUtf8({"ABC", "...", "!C", "MORE", "学路", "b大", "路"}, + {true, true, true, true, true, true, true}); + auto array1 = + MakeArrowArrayUtf8({"ZXL,KMY,DDD,ABC", "!!!,@@@,###,...,,,", ",A,,,,,,,,!C,,,,,", + "MORE", "学路,学路,学路,123", "大b,,,b大", "大b,,学路,学,b大"}, + {true, true, true, true, true, true, true}); // expected output - auto exp_sum = MakeArrowArrayInt32({4, 4, 10, 1}, {true, true, true, true}); + auto exp_sum = MakeArrowArrayInt32({4, 4, 10, 1, 1, 4, 0}, + {true, true, true, true, true, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); From c7609753c2ffaf260b230a6b0b6ffe108a828645 Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Mon, 18 Apr 2022 07:39:59 -0300 Subject: [PATCH 3/8] Change output test name --- cpp/src/gandiva/tests/projector_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index 8a1754fdff5..e6fce44e748 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -2851,7 +2851,7 @@ TEST_F(TestProjector, TestFindInSet) { "MORE", "学路,学路,学路,123", "大b,,,b大", "大b,,学路,学,b大"}, {true, true, true, true, true, true, true}); // expected output - auto exp_sum = MakeArrowArrayInt32({4, 4, 10, 1, 1, 4, 0}, + auto exp_res = MakeArrowArrayInt32({4, 4, 10, 1, 1, 4, 0}, {true, true, true, true, true, true, true}); // prepare input record batch @@ -2863,7 +2863,7 @@ TEST_F(TestProjector, TestFindInSet) { EXPECT_TRUE(status.ok()); // Validate results - EXPECT_ARROW_ARRAY_EQUALS(exp_sum, outputs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp_res, outputs.at(0)); } TEST_F(TestProjector, TestNextDay) { From 443e9b080c3240a2e9a00a684710c6a61724f5dd Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Tue, 5 Jul 2022 11:14:26 -0300 Subject: [PATCH 4/8] Remove invalid return to empty strings --- cpp/src/gandiva/precompiled/string_ops.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 239988ebd26..d1ca63efa4e 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3038,9 +3038,8 @@ int32_t instr_utf8(const char* string, int32_t string_len, const char* substring FORCE_INLINE int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_find_len, const char* string_list, int32_t string_list_len) { - // Return 0 if entry len <= 0 - if (to_find_len <= 0 || string_list_len <= 0) { - gdv_fn_context_set_error_msg(context, "Invalid input values."); + // Return 0 if entry len = 0 + if (to_find_len == 0 || string_list_len == 0) { return 0; } From a0ae3e469398a5228e3c8b2ddc3f5c1577a7d915 Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Wed, 13 Jul 2022 08:56:16 -0300 Subject: [PATCH 5/8] Skip utf8 length check --- cpp/src/gandiva/precompiled/string_ops.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index d1ca63efa4e..ea1da8bfe99 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3052,10 +3052,8 @@ int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_f int32_t cur_length = 0; bool matching = true; - int char_length = 0; - for (int i = 0; i < string_list_len; i += char_length) { - char_length = utf8_char_length(string_list[i]); - if (char_length == 1 && string_list[i] == ',') { + for (int i = 0; i < string_list_len; i++) { + if (string_list[i] == ',') { cur_pos_in_array++; if (matching && cur_length == to_find_len) { return cur_pos_in_array; @@ -3065,13 +3063,13 @@ int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_f } } else { if (cur_length + 1 <= string_list_len) { - if (!matching || (memcmp(string_list + i, to_find + cur_length, char_length))) { + if (!matching || (memcmp(string_list + i, to_find + cur_length, 1))) { matching = false; } } else { matching = false; } - cur_length += utf8_char_length(to_find[cur_length]); + cur_length++; } } From 67e218290c53263f51770ee8869cc8dc2e780048 Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Wed, 20 Jul 2022 12:05:48 -0300 Subject: [PATCH 6/8] Fix return to empty strings --- cpp/src/gandiva/precompiled/string_ops.cc | 5 ----- cpp/src/gandiva/precompiled/string_ops_test.cc | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index ea1da8bfe99..81bc117f2fe 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3038,11 +3038,6 @@ int32_t instr_utf8(const char* string, int32_t string_len, const char* substring FORCE_INLINE int32_t find_in_set_utf8_utf8(int64_t context, const char* to_find, int32_t to_find_len, const char* string_list, int32_t string_list_len) { - // Return 0 if entry len = 0 - if (to_find_len == 0 || string_list_len == 0) { - return 0; - } - // Return 0 if to search entry have commas if (is_substr_utf8_utf8(to_find, to_find_len, reinterpret_cast(","), 1)) { return 0; diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index bcaf527baf9..f61281c7df3 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -2717,5 +2717,11 @@ TEST(TestStringOps, TestFindInSet) { EXPECT_EQ(result, 0); result = find_in_set_utf8_utf8(ctx_ptr, "C", 1, "A,B,,,,,,,C,,,,,", 16); EXPECT_EQ(result, 9); + result = find_in_set_utf8_utf8(ctx_ptr, "", 0, "", 0); + EXPECT_EQ(result, 1); + result = find_in_set_utf8_utf8(ctx_ptr, "", 0, " ", 1); + EXPECT_EQ(result, 0); + result = find_in_set_utf8_utf8(ctx_ptr, " ", 1, "", 0); + EXPECT_EQ(result, 0); } } // namespace gandiva From 92c8bd6d721f1c2e2d6512e10ad655d5027a663d Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Fri, 22 Jul 2022 11:49:56 -0300 Subject: [PATCH 7/8] Add requested tests --- .../gandiva/precompiled/string_ops_test.cc | 4 ++++ cpp/src/gandiva/tests/projector_test.cc | 19 ++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index f61281c7df3..4bfa4709638 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -2723,5 +2723,9 @@ TEST(TestStringOps, TestFindInSet) { EXPECT_EQ(result, 0); result = find_in_set_utf8_utf8(ctx_ptr, " ", 1, "", 0); EXPECT_EQ(result, 0); + result = find_in_set_utf8_utf8(ctx_ptr, "", 0, "a,b,,c,d", 8); + EXPECT_EQ(result, 3); + result = find_in_set_utf8_utf8(ctx_ptr, "", 0, ",", 1); + EXPECT_EQ(result, 1); } } // namespace gandiva diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index e6fce44e748..ebd9bab3473 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -2843,16 +2843,17 @@ TEST_F(TestProjector, TestFindInSet) { EXPECT_TRUE(status.ok()); // Create a row-batch with some sample data - int num_records = 7; - auto array0 = MakeArrowArrayUtf8({"ABC", "...", "!C", "MORE", "学路", "b大", "路"}, - {true, true, true, true, true, true, true}); - auto array1 = - MakeArrowArrayUtf8({"ZXL,KMY,DDD,ABC", "!!!,@@@,###,...,,,", ",A,,,,,,,,!C,,,,,", - "MORE", "学路,学路,学路,123", "大b,,,b大", "大b,,学路,学,b大"}, - {true, true, true, true, true, true, true}); + int num_records = 8; + auto array0 = + MakeArrowArrayUtf8({"ABC", "...", "!C", "MORE", "学路", "b大", "路", "学路"}, + {true, true, true, true, true, true, true, true}); + auto array1 = MakeArrowArrayUtf8( + {"ZXL,KMY,DDD,ABC", "!!!,@@@,###,...,,,", ",A,,,,,,,,!C,,,,,", "MORE", + "学路,学路,学路,123", "大b,,,b大", "大b,,学路,学,b大", "学路"}, + {true, true, true, true, true, true, true, true}); // expected output - auto exp_res = MakeArrowArrayInt32({4, 4, 10, 1, 1, 4, 0}, - {true, true, true, true, true, true, true}); + auto exp_res = MakeArrowArrayInt32({4, 4, 10, 1, 1, 4, 0, 1}, + {true, true, true, true, true, true, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); From 3dcca026b5b82b5dc35b4ca86a382c770631f148 Mon Sep 17 00:00:00 2001 From: ViniciusSouzaRoque Date: Mon, 25 Jul 2022 13:40:01 -0300 Subject: [PATCH 8/8] Empty commit to run CI