diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index d1f97cdb3e8..15d35278a41 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -134,6 +134,20 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      utf8(), kResultNullIfNull, "substr_utf8_int64",
                      NativeFunction::kNeedsContext),
 
+      NativeFunction("lpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
+                     kResultNullIfNull, "lpad_utf8_int32_utf8",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("lpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
+                     kResultNullIfNull, "lpad_utf8_int32", NativeFunction::kNeedsContext),
+
+      NativeFunction("rpad", {}, DataTypeVector{utf8(), int32(), utf8()}, utf8(),
+                     kResultNullIfNull, "rpad_utf8_int32_utf8",
+                     NativeFunction::kNeedsContext),
+
+      NativeFunction("rpad", {}, DataTypeVector{utf8(), int32()}, utf8(),
+                     kResultNullIfNull, "rpad_utf8_int32", NativeFunction::kNeedsContext),
+
       NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(),
                      kResultNullIfNull, "concatOperator_utf8_utf8",
                      NativeFunction::kNeedsContext),
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index fa9164bd139..125a9d4c650 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -190,6 +190,34 @@ gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len) {
   return count;
 }
 
+// Count the number of utf8 characters in `data`, tolerating malformed input: an
+// invalid head byte, or a glyph truncated by the end of the buffer, is counted as a
+// single one-byte character.
+//
+// When a continuation byte is malformed the glyph is widened by one byte, so the
+// scan is additionally bounded by `data_len` to never read past the input buffer.
+FORCE_INLINE
+gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) {
+  int char_len = 0;
+  int count = 0;
+  for (int i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    if (char_len == 0 || i + char_len > data_len) {  // invalid byte or incomplete glyph
+      // if invalid byte or incomplete glyph, ignore it
+      char_len = 1;
+    }
+    // stop at the end of the buffer: widening `char_len` below must not trigger
+    // out-of-bounds reads
+    for (int j = 1; j < char_len && i + j < data_len; ++j) {
+      if ((data[i + j] & 0xC0) != 0x80) {  // bytes following head-byte of glyph
+        char_len += 1;
+      }
+    }
+    ++count;
+  }
+  return count;
+}
+
 // Get the byte position corresponding to a character position for a non-empty utf8
 // sequence
 FORCE_INLINE
@@ -1422,6 +1450,147 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                              out_len);
 }
 
+// Length in bytes of the glyph of `fill_text` that starts at `pos`, as used by the
+// padding functions: an invalid head byte counts as a one-byte character and a glyph
+// truncated by the end of `fill_text` is clamped to the bytes that are available.
+FORCE_INLINE
+gdv_int32 pad_fill_glyph_len(const char* fill_text, gdv_int32 fill_text_len,
+                             gdv_int32 pos) {
+  gdv_int32 glyph_len = utf8_char_length(fill_text[pos]);
+  // ignore invalid char on the fill text, considering it as size 1
+  if (glyph_len == 0) glyph_len = 1;
+  // never step past the end of the fill text
+  if (glyph_len > fill_text_len - pos) glyph_len = fill_text_len - pos;
+  return glyph_len;
+}
+
+// Shared implementation of lpad/rpad.
+//
+// Returns `text` resized to exactly `return_length` utf8 characters:
+//  - an empty `text` or a non-positive `return_length` yields an empty string;
+//  - a `text` longer than `return_length` is truncated on the right;
+//  - a shorter `text` is padded with `fill_text`, repeated cyclically, on the left
+//    when `pad_left` is true and on the right otherwise. With an empty `fill_text`
+//    the input is returned unchanged.
+//
+// `*out_len` receives the size of the result in bytes. The output buffer is sized in
+// bytes rather than in characters, since multi-byte glyphs make the result larger
+// than `return_length` bytes.
+FORCE_INLINE
+const char* pad_utf8_internal(gdv_int64 context, const char* text, gdv_int32 text_len,
+                              gdv_int32 return_length, const char* fill_text,
+                              gdv_int32 fill_text_len, bool pad_left,
+                              gdv_int32* out_len) {
+  // if the text length or the defined return length (number of characters to return)
+  // is <=0, then return an empty string.
+  if (text_len == 0 || return_length <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // count the number of utf8 characters on text, ignoring invalid bytes
+  const gdv_int32 text_char_count = utf8_length_ignore_invalid(text, text_len);
+
+  if (return_length == text_char_count ||
+      (return_length > text_char_count && fill_text_len <= 0)) {
+    // case where the return length is same as the text's length, or if it need to
+    // fill into text but "fill_text" is empty, then return text directly.
+    *out_len = text_len;
+    return text;
+  }
+
+  if (return_length < text_char_count) {
+    // case where it truncates the result on return length.
+    *out_len = utf8_byte_pos(context, text, text_len, return_length);
+    return text;
+  }
+
+  // case (return_length > text_char_count): `pad_chars` characters of "fill_text"
+  // have to be added next to the text.
+  const gdv_int32 pad_chars = return_length - text_char_count;
+
+  // number of characters held by one repetition of "fill_text" (at least one, as
+  // "fill_text" is not empty here)
+  gdv_int32 fill_char_count = 0;
+  for (gdv_int32 pos = 0; pos < fill_text_len; ++fill_char_count) {
+    pos += pad_fill_glyph_len(fill_text, fill_text_len, pos);
+  }
+
+  // the padding is made of `full_repeats` whole copies of "fill_text" followed by
+  // its first `tail_chars` characters, which take `tail_bytes` bytes
+  const gdv_int32 full_repeats = pad_chars / fill_char_count;
+  const gdv_int32 tail_chars = pad_chars % fill_char_count;
+  gdv_int32 tail_bytes = 0;
+  for (gdv_int32 i = 0; i < tail_chars; ++i) {
+    tail_bytes += pad_fill_glyph_len(fill_text, fill_text_len, tail_bytes);
+  }
+
+  // size of the result in bytes, computed on 64 bits to detect int32 overflows
+  const gdv_int64 pad_bytes =
+      static_cast<gdv_int64>(full_repeats) * fill_text_len + tail_bytes;
+  const gdv_int64 total_bytes = pad_bytes + text_len;
+  if (total_bytes > 0x7FFFFFFF) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
+    *out_len = 0;
+    return "";
+  }
+
+  char* ret = reinterpret_cast<char*>(
+      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_bytes)));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // lay out the padding before (lpad) or after (rpad) the main string
+  char* pad_dst = pad_left ? ret : ret + text_len;
+  char* text_dst = pad_left ? ret + pad_bytes : ret;
+  for (gdv_int32 i = 0; i < full_repeats; ++i) {
+    memcpy(pad_dst, fill_text, fill_text_len);
+    pad_dst += fill_text_len;
+  }
+  memcpy(pad_dst, fill_text, tail_bytes);
+  memcpy(text_dst, text, text_len);
+
+  *out_len = static_cast<gdv_int32>(total_bytes);
+  return ret;
+}
+
+// Pads `text` on the left with `fill_text` (or truncates it) so that the result is
+// `return_length` utf8 characters long. See pad_utf8_internal for the details.
+FORCE_INLINE
+const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
+  return pad_utf8_internal(context, text, text_len, return_length, fill_text,
+                           fill_text_len, /*pad_left=*/true, out_len);
+}
+
+// Pads `text` on the right with `fill_text` (or truncates it) so that the result is
+// `return_length` utf8 characters long. See pad_utf8_internal for the details.
+FORCE_INLINE
+const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len) {
+  return pad_utf8_internal(context, text, text_len, return_length, fill_text,
+                           fill_text_len, /*pad_left=*/false, out_len);
+}
+
+// Same as lpad_utf8_int32_utf8, padding with a single space character.
+FORCE_INLINE
+const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len) {
+  return lpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
+}
+
+// Same as rpad_utf8_int32_utf8, padding with a single space character.
+FORCE_INLINE
+const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len) {
+  return rpad_utf8_int32_utf8(context, text, text_len, return_length, " ", 1, out_len);
+}
+
 FORCE_INLINE
 const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
                        const char* delimiter, gdv_int32 delim_len, gdv_int32 index,
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 9326aac1e0f..1a4aafa266c 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -696,6 +696,144 @@ TEST(TestStringOps, TestLtrim) {
   EXPECT_FALSE(ctx.has_error());
 }
 
+TEST(TestStringOps, TestLpadString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // LPAD function tests - with defined fill pad text
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFillTestString");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFTestString");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "FillFillFiTestString");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ддабвгд");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "дhello");
+
+  // LPAD function tests - with NO pad text
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "        TestString");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "     TestString");
+
+  out_str = lpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "  абвгд");
+}
+
+TEST(TestStringOps, TestRpadString) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  // RPAD function tests - with defined fill pad text
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абвгддд");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "helloд");
+
+  // RPAD function tests - with NO pad text
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 0, 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -500, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString        ");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, 15, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString     ");
+
+  out_str = rpad_utf8_int32(ctx_ptr, "абвгд", 10, 7, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "абвгд  ");
+}
+
 TEST(TestStringOps, TestRtrim) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 1b0f96e0ab7..4ece685388d 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -407,6 +407,23 @@ gdv_int32 locate_utf8_utf8_int32(gdv_int64 context, const char* sub_str,
                                  gdv_int32 sub_str_len, const char* str,
                                  gdv_int32 str_len, gdv_int32 start_pos);
 
+// lpad/rpad: resize `text` to `return_length` utf8 characters, truncating it or
+// padding it on the left/right with `fill_text`. The variants without `fill_text`
+// pad with a single space.
+const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len);
+
+const char* rpad_utf8_int32_utf8(gdv_int64 context, const char* text, gdv_int32 text_len,
+                                 gdv_int32 return_length, const char* fill_text,
+                                 gdv_int32 fill_text_len, gdv_int32* out_len);
+
+const char* lpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len);
+
+const char* rpad_utf8_int32(gdv_int64 context, const char* text, gdv_int32 text_len,
+                            gdv_int32 return_length, gdv_int32* out_len);
+
 const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                                 gdv_int32 text_len, const char* from_str,
                                                 gdv_int32 from_str_len,
diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc
index b63af40d359..af1e6544115 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -1010,4 +1010,90 @@ TEST_F(TestProjector, TestIfElseOpt) {
   EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
 }
 
+TEST_F(TestProjector, TestLpad) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f1", arrow::int32());
+  auto field2 = field("f2", arrow::utf8());
+  auto schema = arrow::schema({field0, field1, field2});
+
+  // output fields
+  auto field_lpad = field("lpad", arrow::utf8());
+
+  // Build expression
+  auto lpad_expr =
+      TreeExprBuilder::MakeExpression("lpad", {field0, field1, field2}, field_lpad);
+
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {lpad_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  // (covers truncation, single/multi-char padding, zero length and empty input)
+  int num_records = 7;
+  auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""},
+                                   {true, true, true, true, true, true, true});
+  auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10},
+                                    {true, true, true, true, true, true, true});
+  auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""},
+                                   {true, true, true, true, true, true, true});
+  // expected output
+  auto exp_lpad = MakeArrowArrayUtf8({"a", "zzzza", "cab", "validinvalid", "", "in", ""},
+                                     {true, true, true, true, true, true, true});
+
+  // prepare input record batch
+  auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_lpad, outputs.at(0));
+}
+
+TEST_F(TestProjector, TestRpad) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f1", arrow::int32());
+  auto field2 = field("f2", arrow::utf8());
+  auto schema = arrow::schema({field0, field1, field2});
+
+  // output fields
+  auto field_rpad = field("rpad", arrow::utf8());
+
+  // Build expression
+  auto rpad_expr =
+      TreeExprBuilder::MakeExpression("rpad", {field0, field1, field2}, field_rpad);
+
+  std::shared_ptr<Projector> projector;
+  auto status = Projector::Make(schema, {rpad_expr}, TestConfiguration(), &projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  // (covers truncation, single/multi-char padding, zero length and empty input)
+  int num_records = 7;
+  auto array0 = MakeArrowArrayUtf8({"ab", "a", "ab", "invalid", "valid", "invalid", ""},
+                                   {true, true, true, true, true, true, true});
+  auto array1 = MakeArrowArrayInt32({1, 5, 3, 12, 0, 2, 10},
+                                    {true, true, true, true, true, true, true});
+  auto array2 = MakeArrowArrayUtf8({"z", "z", "c", "valid", "invalid", "invalid", ""},
+                                   {true, true, true, true, true, true, true});
+  // expected output
+  auto exp_rpad = MakeArrowArrayUtf8({"a", "azzzz", "abc", "invalidvalid", "", "in", ""},
+                                     {true, true, true, true, true, true, true});
+
+  // prepare input record batch
+  auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1, array2});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in, pool_, &outputs);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(exp_rpad, outputs.at(0));
+}
+
 }  // namespace gandiva