diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 51633742967..87782cc6924 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -906,6 +906,16 @@ Status DictionaryArray::FromArrays(const std::shared_ptr& type, return Status::OK(); } +static bool IsTrivialTransposition(const std::vector& transpose_map, + int64_t input_dict_size) { + for (int64_t i = 0; i < input_dict_size; ++i) { + if (transpose_map[i] != i) { + return false; + } + } + return true; +} + template static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data, const std::vector& transpose_map, @@ -927,6 +937,14 @@ Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptrid() != Type::DICTIONARY) { return Status::TypeError("Expected dictionary type"); } + const int64_t in_dict_len = data_->dictionary->length(); + if (in_dict_len > static_cast(transpose_map.size())) { + return Status::Invalid( + "Transpose map too small for dictionary array " + "(has ", + transpose_map.size(), " values, need at least ", in_dict_len, ")"); + } + const auto& out_dict_type = checked_cast(*type); const auto& out_index_type = @@ -935,12 +953,33 @@ Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptrindex_type()->id(); auto out_type_id = out_index_type.id(); + if (in_type_id == out_type_id && IsTrivialTransposition(transpose_map, in_dict_len)) { + // Index type and values will be identical => we can simply reuse + // the existing buffers. + auto out_data = + ArrayData::Make(type, data_->length, {data_->buffers[0], data_->buffers[1]}, + data_->null_count, data_->offset); + out_data->dictionary = dictionary; + *out = MakeArray(out_data); + return Status::OK(); + } + + // Default path: compute a buffer of transposed indices. std::shared_ptr out_buffer; RETURN_NOT_OK(AllocateBuffer( pool, data_->length * out_index_type.bit_width() * CHAR_BIT, &out_buffer)); - // Null bitmap is unchanged - auto out_data = ArrayData::Make(type, data_->length, {data_->buffers[0], out_buffer}, - data_->null_count); + + // Shift null buffer if the original offset is non-zero + std::shared_ptr null_bitmap; + if (data_->offset != 0) { + RETURN_NOT_OK( + CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length, &null_bitmap)); + } else { + null_bitmap = data_->buffers[0]; + } + + auto out_data = + ArrayData::Make(type, data_->length, {null_bitmap, out_buffer}, data_->null_count); out_data->dictionary = dictionary; #define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \ diff --git a/cpp/src/arrow/array_dict_test.cc b/cpp/src/arrow/array_dict_test.cc index ca6552cb23d..78819c2156d 100644 --- a/cpp/src/arrow/array_dict_test.cc +++ b/cpp/src/arrow/array_dict_test.cc @@ -947,65 +947,115 @@ TEST(TestDictionary, FromArray) { ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, dict, &arr4)); } +static void CheckTranspose(const std::shared_ptr& input, + const std::vector& transpose_map, + const std::shared_ptr& out_dict_type, + const std::shared_ptr& out_dict, + const std::shared_ptr& expected_indices) { + std::shared_ptr out, expected; + ASSERT_OK(internal::checked_cast(*input).Transpose( + default_memory_pool(), out_dict_type, out_dict, transpose_map, &out)); + ASSERT_OK(out->Validate()); + + ASSERT_OK( + DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, &expected)); + AssertArraysEqual(*out, *expected); +} + TEST(TestDictionary, TransposeBasic) { - std::shared_ptr arr, out, expected; + std::shared_ptr arr, sliced; auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); auto dict_type = dictionary(int16(), utf8()); auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); // ["B", "C", "A", "A"] ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr)); + // ["C", "A"] + sliced = arr->Slice(1, 2); // Transpose to same index type { + auto out_dict_type = dict_type; auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); - - const std::vector transpose_map{1, 3, 2}; - ASSERT_OK(internal::checked_cast(*arr).Transpose( - default_memory_pool(), dict_type, out_dict, transpose_map, &out)); - auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]"); - ASSERT_OK( - DictionaryArray::FromArrays(dict_type, expected_indices, out_dict, &expected)); - AssertArraysEqual(*out, *expected); + CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices); + + // Sliced + expected_indices = ArrayFromJSON(int16(), "[2, 1]"); + CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices); } - // Transpose to other type + // Transpose to other index type { - auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); auto out_dict_type = dictionary(int8(), utf8()); + auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); + auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); + CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices); - const std::vector transpose_map{1, 3, 2}; - ASSERT_OK(internal::checked_cast(*arr).Transpose( - default_memory_pool(), out_dict_type, out_dict, transpose_map, &out)); + // Sliced + expected_indices = ArrayFromJSON(int8(), "[2, 1]"); + CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices); + } +} - auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]"); - ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, - &expected)); - AssertArraysEqual(*expected, *out); +TEST(TestDictionary, TransposeTrivial) { + // Test a trivial transposition, possibly optimized away + std::shared_ptr arr, sliced; + + auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); + auto dict_type = dictionary(int16(), utf8()); + auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + // ["B", "C", "A", "A"] + ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr)); + // ["C", "A"] + sliced = arr->Slice(1, 2); + + // Transpose to same index type + { + auto out_dict_type = dict_type; + auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]"); + auto expected_indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]"); + CheckTranspose(arr, {0, 1, 2}, out_dict_type, out_dict, expected_indices); + + // Sliced + expected_indices = ArrayFromJSON(int16(), "[2, 0]"); + CheckTranspose(sliced, {0, 1, 2}, out_dict_type, out_dict, expected_indices); + } + + // Transpose to other index type + { + auto out_dict_type = dictionary(int8(), utf8()); + auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]"); + auto expected_indices = ArrayFromJSON(int8(), "[1, 2, 0, 0]"); + CheckTranspose(arr, {0, 1, 2}, out_dict_type, out_dict, expected_indices); + + // Sliced + expected_indices = ArrayFromJSON(int8(), "[2, 0]"); + CheckTranspose(sliced, {0, 1, 2}, out_dict_type, out_dict, expected_indices); } } TEST(TestDictionary, TransposeNulls) { - std::shared_ptr arr, out, expected; + std::shared_ptr arr, sliced; auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]"); auto dict_type = dictionary(int16(), utf8()); auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]"); // ["B", "C", null, "A"] ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr)); + // ["C", null] + sliced = arr->Slice(1, 2); auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]"); auto out_dict_type = dictionary(int16(), utf8()); + auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); - const std::vector transpose_map{1, 3, 2}; - ASSERT_OK(internal::checked_cast(*arr).Transpose( - default_memory_pool(), out_dict_type, out_dict, transpose_map, &out)); + CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices); - auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]"); - ASSERT_OK( - DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, &expected)); - AssertArraysEqual(*expected, *out); + // Sliced + expected_indices = ArrayFromJSON(int16(), "[2, null]"); + + CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices); } TEST(TestDictionary, DISABLED_ListOfDictionary) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e15336986cc..ff3b89ea7e7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1259,7 +1259,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { /// one per input type. Each integer vector represents the transposition /// of input type indices into unified type indices. // XXX Should we return something special (an empty transpose map?) when - // the transposition is the identity function? + // the transposition is the identity function? Currently this case is + // detected in DictionaryArray::Transpose. static Status Unify(MemoryPool* pool, const std::vector& types, const std::vector& dictionaries, std::shared_ptr* out_type,