From 0390b0b09103794e37b0be73ff74cd954ee3496d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 18 Apr 2021 19:39:28 +0200 Subject: [PATCH 1/2] ARROW-12420: [C++/Dataset] Reading null columns as dictionary not longer possible --- cpp/src/arrow/array/array_dict.cc | 1 - .../compute/kernels/scalar_cast_nested.cc | 6 +++++- python/pyarrow/tests/test_dataset.py | 21 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 2fa95e9a176..d7e2494a0b8 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -80,7 +80,6 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const { DictionaryArray::DictionaryArray(const std::shared_ptr& data) : dict_type_(checked_cast(data->type.get())) { ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY); - ARROW_CHECK_NE(data->dictionary, nullptr); SetData(data); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 2592b77ab66..85f9aae0e90 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -118,7 +118,11 @@ std::vector> GetNestedCasts() { auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); - return {cast_list, cast_large_list, cast_fsl, cast_struct}; + // So is dictionary + auto cast_dictionary = std::make_shared("cast_dictionary", Type::DICTIONARY); + AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get()); + + return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary}; } } // namespace internal diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 6ca6b095936..7688cf78ac7 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -3156,3 +3156,24 @@ def test_write_dataset_s3(s3_example_simple): "mybucket/dataset3", filesystem=fs, format="ipc", partitioning="hive" ).to_table() assert result.equals(table) + + +@pytest.mark.parquet +def test_dataset_null_to_dictionary_cast(tempdir): + # ARROW-12420 + import pyarrow.parquet as pq + + table = pa.table({"a": [None, None]}) + pq.write_table(table, tempdir / "test.parquet") + + schema = pa.schema([ + pa.field("a", pa.dictionary(pa.int32(), pa.string())) + ]) + fsds = ds.FileSystemDataset.from_paths( + paths=[tempdir / "test.parquet"], + schema=schema, + format=ds.ParquetFileFormat(), + filesystem=fs.LocalFileSystem(), + ) + table = fsds.to_table() + assert table.schema == schema From 56725a033a3034bd319fa1fbb4dca53b6d985ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 19 Apr 2021 00:31:48 +0200 Subject: [PATCH 2/2] Don't preallocate dictionary types --- cpp/src/arrow/array/array_dict.cc | 1 + cpp/src/arrow/compute/exec.cc | 1 + cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 3 ++- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 ++++++++ python/pyarrow/tests/test_array.py | 3 ++- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index d7e2494a0b8..2fa95e9a176 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -80,6 +80,7 @@ int64_t DictionaryArray::GetValueIndex(int64_t i) const { DictionaryArray::DictionaryArray(const std::shared_ptr& data) : dict_type_(checked_cast(data->type.get())) { ARROW_CHECK_EQ(data->type->id(), Type::DICTIONARY); + ARROW_CHECK_NE(data->dictionary, nullptr); SetData(data); } diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c3187a3995a..b88248071c2 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -704,6 +704,7 @@ class ScalarExecutor : public KernelExecutorImpl { preallocate_contiguous_ = (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && validity_preallocated_ && !is_nested(output_descr_.type->id()) && + !is_dictionary(output_descr_.type->id()) && data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && std::all_of(data_preallocated_.begin(), data_preallocated_.end(), [](const BufferPreallocation& prealloc) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 85f9aae0e90..1d81be48288 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -119,7 +119,8 @@ std::vector> GetNestedCasts() { AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); // So is dictionary - auto cast_dictionary = std::make_shared("cast_dictionary", Type::DICTIONARY); + auto cast_dictionary = + std::make_shared("cast_dictionary", Type::DICTIONARY); AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get()); return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary}; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 10e5ed26e5d..6efecbb2ad0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1782,6 +1782,14 @@ TEST(Cast, FromNull) { } } +TEST(Cast, FromNullToDictionary) { + auto from = std::make_shared(10); + auto to_type = dictionary(int8(), boolean()); + + ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(to_type, 10)); + CheckCast(from, expected); +} + // ---------------------------------------------------------------------- // Test casting from DictionaryType diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 01ee2977fec..37d69363816 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1358,12 +1358,13 @@ def test_cast_from_null(): pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.list_(pa.int8())), pa.field('c', pa.string())]), + pa.dictionary(pa.int32(), pa.string()), ] for out_type in out_types: _check_cast_case((in_data, in_type, in_data, out_type)) out_types = [ - pa.dictionary(pa.int32(), pa.string()), + pa.union([pa.field('a', pa.binary(10)), pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)),