From 030c445e3adab05e5e51cdeea6906729829cc9b3 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Sun, 24 Dec 2023 14:01:46 +0900 Subject: [PATCH 1/4] Add casting from string to dictionary --- .../compute/kernels/scalar_cast_dictionary.cc | 28 ++++++++++++++----- cpp/src/arrow/dataset/partition_test.cc | 4 +-- cpp/src/arrow/scalar_test.cc | 9 +++--- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index 13c0d599bf9..4b362615130 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -44,6 +44,14 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o } std::shared_ptr in_array = batch[0].array.ToArrayData(); + + // If the input type is STRING, it is first encoded as a dictionary to facilitate + // processing. This approach allows the subsequent code to uniformly handle STRING + // inputs as if they were originally provided in dictionary format. Encoding as a + // dictionary helps in reusing the same logic for dictionary operations. + if (batch[0].type()->id() == Type::STRING) { + in_array = DictionaryEncode(batch[0].array.ToArrayData())->array(); + } const auto& in_type = checked_cast(*in_array->type); ArrayData* out_array = out->array_data().get(); @@ -78,16 +86,22 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o } std::vector> GetDictionaryCasts() { - auto func = std::make_shared("cast_dictionary", Type::DICTIONARY); + auto create_cast_function = [](const std::string& name, Type::type input_type) { + auto cast_function = std::make_shared(name, Type::DICTIONARY); + AddCommonCasts(input_type, kOutputTargetType, cast_function.get()); + + ScalarKernel kernel({InputType(input_type)}, kOutputTargetType, CastToDictionary); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + DCHECK_OK(cast_function->AddKernel(input_type, std::move(kernel))); - AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get()); - ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastToDictionary); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + return cast_function; + }; - DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel))); + auto cast_dict = create_cast_function("cast_dictionary", Type::DICTIONARY); + auto cast_string = create_cast_function("cast_dictionary", Type::STRING); - return {func}; + return {cast_dict, cast_string}; } } // namespace internal diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc index 7ec96929a93..1b71be15d19 100644 --- a/cpp/src/arrow/dataset/partition_test.cc +++ b/cpp/src/arrow/dataset/partition_test.cc @@ -316,7 +316,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningFormatDictionary) { ArrayVector{dictionary}); written_schema_ = partitioning_->schema(); - ASSERT_OK_AND_ASSIGN(auto dict_hello, MakeScalar("hello")->CastTo(DictStr("")->type())); + ASSERT_OK_AND_ASSIGN(auto dict_hello, Cast(MakeScalar("hello"), DictStr("")->type())); AssertFormat(equal(field_ref("alpha"), literal(dict_hello)), "hello"); } @@ -329,7 +329,7 @@ TEST_F(TestPartitioning, DirectoryPartitioningFormatDictionaryCustomIndex) { schema({field("alpha", dict_type)}), ArrayVector{dictionary}); written_schema_ = partitioning_->schema(); - ASSERT_OK_AND_ASSIGN(auto dict_hello, MakeScalar("hello")->CastTo(dict_type)); + ASSERT_OK_AND_ASSIGN(auto dict_hello, Cast(MakeScalar("hello"), dict_type)); AssertFormat(equal(field_ref("alpha"), literal(dict_hello)), "hello"); } diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index ac740f92c85..cc4d8a6f6a3 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1479,11 +1479,12 @@ TEST(TestDictionaryScalar, Cast) { auto alpha = dict->IsValid(i) ? MakeScalar(dict->GetString(i)) : MakeNullScalar(utf8()); // Cast string to dict(..., string) - ASSERT_OK_AND_ASSIGN(auto cast_alpha, alpha->CastTo(ty)); - ASSERT_OK(cast_alpha->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto cast_alpha, Cast(alpha, ty)); + const auto& scalar = cast_alpha.scalar(); + ASSERT_OK(scalar->ValidateFull()); ASSERT_OK_AND_ASSIGN( auto roundtripped_alpha, - checked_cast(*cast_alpha).GetEncodedValue()); + checked_cast(*scalar).GetEncodedValue()); ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); @@ -1496,7 +1497,7 @@ TEST(TestDictionaryScalar, Cast) { AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); // dictionaries differ, though encoded values are identical - ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); + ASSERT_FALSE(alpha_dict.Equals(*scalar)); } } } From 731e944a57089a15c60bcb3bae997140ba9b5797 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Sun, 24 Dec 2023 16:19:47 +0900 Subject: [PATCH 2/4] Fix adding dictionary cast kernel --- .../compute/kernels/scalar_cast_dictionary.cc | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index 4b362615130..1a18e9613b8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -85,23 +85,23 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o return Status::OK(); } -std::vector> GetDictionaryCasts() { - auto create_cast_function = [](const std::string& name, Type::type input_type) { - auto cast_function = std::make_shared(name, Type::DICTIONARY); - AddCommonCasts(input_type, kOutputTargetType, cast_function.get()); - - ScalarKernel kernel({InputType(input_type)}, kOutputTargetType, CastToDictionary); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; - DCHECK_OK(cast_function->AddKernel(input_type, std::move(kernel))); - - return cast_function; - }; +template +void AddDictionaryCast(CastFunction* func) { + ScalarKernel kernel; + kernel.exec = CastToDictionary; + kernel.signature = + KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); +} - auto cast_dict = create_cast_function("cast_dictionary", Type::DICTIONARY); - auto cast_string = create_cast_function("cast_dictionary", Type::STRING); +std::vector> GetDictionaryCasts() { + auto cast_dict = std::make_shared("cast_dictionary", Type::DICTIONARY); + AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dict.get()); + AddDictionaryCast(cast_dict.get()); + AddDictionaryCast(cast_dict.get()); - return {cast_dict, cast_string}; + return {cast_dict}; } } // namespace internal From ea42725bf87159eb63809eaf19338379b5d9e2c5 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Fri, 5 Jan 2024 12:32:44 +0900 Subject: [PATCH 3/4] Update the suggestions from the review --- .../arrow/compute/kernels/scalar_cast_dictionary.cc | 9 +++++---- cpp/src/arrow/scalar_test.cc | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index 1a18e9613b8..a7ab2fa8d0e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -36,21 +36,21 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o const CastOptions& options = CastState::Get(ctx); const auto& out_type = checked_cast(*out->type()); + std::shared_ptr in_array = batch[0].array.ToArrayData(); + // if out type is same as in type, return input if (out_type.Equals(*batch[0].type())) { /// XXX: This is the wrong place to do a zero-copy optimization - out->value = batch[0].array.ToArrayData(); + out->value = in_array; return Status::OK(); } - std::shared_ptr in_array = batch[0].array.ToArrayData(); - // If the input type is STRING, it is first encoded as a dictionary to facilitate // processing. This approach allows the subsequent code to uniformly handle STRING // inputs as if they were originally provided in dictionary format. Encoding as a // dictionary helps in reusing the same logic for dictionary operations. if (batch[0].type()->id() == Type::STRING) { - in_array = DictionaryEncode(batch[0].array.ToArrayData())->array(); + in_array = DictionaryEncode(in_array)->array(); } const auto& in_type = checked_cast(*in_array->type); @@ -92,6 +92,7 @@ void AddDictionaryCast(CastFunction* func) { kernel.signature = KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index cc4d8a6f6a3..c6b81eb46b4 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1479,12 +1479,12 @@ TEST(TestDictionaryScalar, Cast) { auto alpha = dict->IsValid(i) ? MakeScalar(dict->GetString(i)) : MakeNullScalar(utf8()); // Cast string to dict(..., string) - ASSERT_OK_AND_ASSIGN(auto cast_alpha, Cast(alpha, ty)); - const auto& scalar = cast_alpha.scalar(); - ASSERT_OK(scalar->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto cast_alpha_datum, Cast(alpha, ty)); + const auto& cast_alpha = cast_alpha_datum.scalar(); + ASSERT_OK(cast_alpha->ValidateFull()); ASSERT_OK_AND_ASSIGN( auto roundtripped_alpha, - checked_cast(*scalar).GetEncodedValue()); + checked_cast(*cast_alpha).GetEncodedValue()); ASSERT_OK_AND_ASSIGN(auto i_scalar, MakeScalar(index_ty, i)); auto alpha_dict = DictionaryScalar({i_scalar, dict}, ty); @@ -1497,7 +1497,7 @@ TEST(TestDictionaryScalar, Cast) { AssertScalarsEqual(*encoded_alpha, *roundtripped_alpha); // dictionaries differ, though encoded values are identical - ASSERT_FALSE(alpha_dict.Equals(*scalar)); + ASSERT_FALSE(alpha_dict.Equals(*cast_alpha)); } } } From b4996ef1e1a58fa5fcefeb45dc187f7a3d88d1a4 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Sun, 7 Jan 2024 07:25:55 +0900 Subject: [PATCH 4/4] Update to use ScalarKernel constructor --- cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc index a7ab2fa8d0e..f13aa26d969 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc @@ -87,10 +87,7 @@ Status CastToDictionary(KernelContext* ctx, const ExecSpan& batch, ExecResult* o template void AddDictionaryCast(CastFunction* func) { - ScalarKernel kernel; - kernel.exec = CastToDictionary; - kernel.signature = - KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); + ScalarKernel kernel({InputType(SrcType::type_id)}, kOutputTargetType, CastToDictionary); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));