Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 42 additions & 3 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,16 @@ Status DictionaryArray::FromArrays(const std::shared_ptr<DataType>& type,
return Status::OK();
}

static bool IsTrivialTransposition(const std::vector<int32_t>& transpose_map,
int64_t input_dict_size) {
for (int64_t i = 0; i < input_dict_size; ++i) {
if (transpose_map[i] != i) {
return false;
}
}
return true;
}

template <typename InType, typename OutType>
static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data,
const std::vector<int32_t>& transpose_map,
Expand All @@ -927,6 +937,14 @@ Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataTy
if (type->id() != Type::DICTIONARY) {
return Status::TypeError("Expected dictionary type");
}
const int64_t in_dict_len = data_->dictionary->length();
if (in_dict_len > static_cast<int64_t>(transpose_map.size())) {
return Status::Invalid(
"Transpose map too small for dictionary array "
"(has ",
transpose_map.size(), " values, need at least ", in_dict_len, ")");
}

const auto& out_dict_type = checked_cast<const DictionaryType&>(*type);

const auto& out_index_type =
Expand All @@ -935,12 +953,33 @@ Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataTy
auto in_type_id = dict_type_->index_type()->id();
auto out_type_id = out_index_type.id();

if (in_type_id == out_type_id && IsTrivialTransposition(transpose_map, in_dict_len)) {
// Index type and values will be identical => we can simply reuse
// the existing buffers.
auto out_data =
ArrayData::Make(type, data_->length, {data_->buffers[0], data_->buffers[1]},
data_->null_count, data_->offset);
out_data->dictionary = dictionary;
*out = MakeArray(out_data);
return Status::OK();
}

// Default path: compute a buffer of transposed indices.
std::shared_ptr<Buffer> out_buffer;
RETURN_NOT_OK(AllocateBuffer(
pool, data_->length * out_index_type.bit_width() * CHAR_BIT, &out_buffer));
// Null bitmap is unchanged
auto out_data = ArrayData::Make(type, data_->length, {data_->buffers[0], out_buffer},
data_->null_count);

// Shift null buffer if the original offset is non-zero
std::shared_ptr<Buffer> null_bitmap;
if (data_->offset != 0) {
RETURN_NOT_OK(
CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length, &null_bitmap));
} else {
null_bitmap = data_->buffers[0];
}

auto out_data =
ArrayData::Make(type, data_->length, {null_bitmap, out_buffer}, data_->null_count);
out_data->dictionary = dictionary;

#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \
Expand Down
102 changes: 76 additions & 26 deletions cpp/src/arrow/array_dict_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -947,65 +947,115 @@ TEST(TestDictionary, FromArray) {
ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, dict, &arr4));
}

static void CheckTranspose(const std::shared_ptr<Array>& input,
const std::vector<int32_t>& transpose_map,
const std::shared_ptr<DataType>& out_dict_type,
const std::shared_ptr<Array>& out_dict,
const std::shared_ptr<Array>& expected_indices) {
std::shared_ptr<Array> out, expected;
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*input).Transpose(
default_memory_pool(), out_dict_type, out_dict, transpose_map, &out));
ASSERT_OK(out->Validate());

ASSERT_OK(
DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, &expected));
AssertArraysEqual(*out, *expected);
}

TEST(TestDictionary, TransposeBasic) {
std::shared_ptr<Array> arr, out, expected;
std::shared_ptr<Array> arr, sliced;

auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
auto dict_type = dictionary(int16(), utf8());
auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
// ["B", "C", "A", "A"]
ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr));
// ["C", "A"]
sliced = arr->Slice(1, 2);

// Transpose to same index type
{
auto out_dict_type = dict_type;
auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), dict_type, out_dict, transpose_map, &out));

auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]");
ASSERT_OK(
DictionaryArray::FromArrays(dict_type, expected_indices, out_dict, &expected));
AssertArraysEqual(*out, *expected);
CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices);

// Sliced
expected_indices = ArrayFromJSON(int16(), "[2, 1]");
CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices);
}

// Transpose to other type
// Transpose to other index type
{
auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto out_dict_type = dictionary(int8(), utf8());
auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices);

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), out_dict_type, out_dict, transpose_map, &out));
// Sliced
expected_indices = ArrayFromJSON(int8(), "[2, 1]");
CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices);
}
}

auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict,
&expected));
AssertArraysEqual(*expected, *out);
TEST(TestDictionary, TransposeTrivial) {
// Test a trivial transposition, possibly optimized away
std::shared_ptr<Array> arr, sliced;

auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
auto dict_type = dictionary(int16(), utf8());
auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
// ["B", "C", "A", "A"]
ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr));
// ["C", "A"]
sliced = arr->Slice(1, 2);

// Transpose to same index type
{
auto out_dict_type = dict_type;
auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
auto expected_indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
CheckTranspose(arr, {0, 1, 2}, out_dict_type, out_dict, expected_indices);

// Sliced
expected_indices = ArrayFromJSON(int16(), "[2, 0]");
CheckTranspose(sliced, {0, 1, 2}, out_dict_type, out_dict, expected_indices);
}

// Transpose to other index type
{
auto out_dict_type = dictionary(int8(), utf8());
auto out_dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\", \"D\"]");
auto expected_indices = ArrayFromJSON(int8(), "[1, 2, 0, 0]");
CheckTranspose(arr, {0, 1, 2}, out_dict_type, out_dict, expected_indices);

// Sliced
expected_indices = ArrayFromJSON(int8(), "[2, 0]");
CheckTranspose(sliced, {0, 1, 2}, out_dict_type, out_dict, expected_indices);
}
}

TEST(TestDictionary, TransposeNulls) {
std::shared_ptr<Array> arr, out, expected;
std::shared_ptr<Array> arr, sliced;

auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
auto dict_type = dictionary(int16(), utf8());
auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
// ["B", "C", null, "A"]
ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, dict, &arr));
// ["C", null]
sliced = arr->Slice(1, 2);

auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
auto out_dict_type = dictionary(int16(), utf8());
auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");

const std::vector<int32_t> transpose_map{1, 3, 2};
ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
default_memory_pool(), out_dict_type, out_dict, transpose_map, &out));
CheckTranspose(arr, {1, 3, 2}, out_dict_type, out_dict, expected_indices);

auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
ASSERT_OK(
DictionaryArray::FromArrays(out_dict_type, expected_indices, out_dict, &expected));
AssertArraysEqual(*expected, *out);
// Sliced
expected_indices = ArrayFromJSON(int16(), "[2, null]");

CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices);
}

TEST(TestDictionary, DISABLED_ListOfDictionary) {
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -1259,7 +1259,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
/// one per input type. Each integer vector represents the transposition
/// of input type indices into unified type indices.
// XXX Should we return something special (an empty transpose map?) when
// the transposition is the identity function?
// the transposition is the identity function? Currently this case is
// detected in DictionaryArray::Transpose.
static Status Unify(MemoryPool* pool, const std::vector<const DataType*>& types,
const std::vector<const Array*>& dictionaries,
std::shared_ptr<DataType>* out_type,
Expand Down