diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index c3187a3995a..b88248071c2 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -704,6 +704,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
     preallocate_contiguous_ =
         (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
          validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
+         !is_dictionary(output_descr_.type->id()) &&
          data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
          std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
                      [](const BufferPreallocation& prealloc) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index 2592b77ab66..1d81be48288 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -118,7 +118,12 @@ std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
   auto cast_struct = std::make_shared<CastFunction>("cast_struct", Type::STRUCT);
   AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get());
 
-  return {cast_list, cast_large_list, cast_fsl, cast_struct};
+  // So is dictionary
+  auto cast_dictionary =
+      std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+  AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
+
+  return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
 }
 
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
index 10e5ed26e5d..6efecbb2ad0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc
@@ -1782,6 +1782,14 @@ TEST(Cast, FromNull) {
   }
 }
 
+TEST(Cast, FromNullToDictionary) {
+  auto from = std::make_shared<NullArray>(10);
+  auto to_type = dictionary(int8(), boolean());
+
+  ASSERT_OK_AND_ASSIGN(auto expected, MakeArrayOfNull(to_type, 10));
+  CheckCast(from, expected);
+}
+
 // ----------------------------------------------------------------------
 // Test casting from DictionaryType
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 01ee2977fec..37d69363816 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1358,12 +1358,12 @@ def test_cast_from_null():
         pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.list_(pa.int8())),
                    pa.field('c', pa.string())]),
+        pa.dictionary(pa.int32(), pa.string()),
     ]
     for out_type in out_types:
         _check_cast_case((in_data, in_type, in_data, out_type))
 
     out_types = [
-        pa.dictionary(pa.int32(), pa.string()),
         pa.union([pa.field('a', pa.binary(10)),
                   pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
         pa.union([pa.field('a', pa.binary(10)),
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 6ca6b095936..7688cf78ac7 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3156,3 +3156,24 @@ def test_write_dataset_s3(s3_example_simple):
         "mybucket/dataset3", filesystem=fs, format="ipc", partitioning="hive"
     ).to_table()
     assert result.equals(table)
+
+
+@pytest.mark.parquet
+def test_dataset_null_to_dictionary_cast(tempdir):
+    # ARROW-12420
+    import pyarrow.parquet as pq
+
+    table = pa.table({"a": [None, None]})
+    pq.write_table(table, tempdir / "test.parquet")
+
+    schema = pa.schema([
+        pa.field("a", pa.dictionary(pa.int32(), pa.string()))
+    ])
+    fsds = ds.FileSystemDataset.from_paths(
+        paths=[tempdir / "test.parquet"],
+        schema=schema,
+        format=ds.ParquetFileFormat(),
+        filesystem=fs.LocalFileSystem(),
+    )
+    table = fsds.to_table()
+    assert table.schema == schema