Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions cpp/src/arrow/array-dict-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,26 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
ASSERT_TRUE(expected_delta2.Equals(result_delta2));
}

TYPED_TEST(TestDictionaryBuilder, Dictionary32_BasicPrimitive) {
using c_type = typename TypeParam::c_type;
auto type = std::make_shared<TypeParam>();
auto dict_type = dictionary(int32(), type);

Dictionary32Builder<TypeParam> builder;

ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
std::shared_ptr<Array> result;
FinishAndCheckPadding(&builder, &result);

// Build expected data for the initial dictionary
auto ex_dict1 = ArrayFromJSON(type, "[1, 2]");
DictionaryArray expected(dict_type, ArrayFromJSON(int32(), "[0, 1, 0, 1]"), ex_dict1);
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestStringDictionaryBuilder, Basic) {
// Build the dictionary Array
StringDictionaryBuilder builder;
Expand All @@ -301,11 +321,14 @@ TEST(TestStringDictionaryBuilder, Basic) {
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestStringDictionaryBuilder, AppendIndices) {
template <typename BuilderType, typename IndexType, typename AppendCType>
void TestStringDictionaryAppendIndices() {
auto index_type = TypeTraits<IndexType>::type_singleton();

auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])");
auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])");

StringDictionaryBuilder builder;
BuilderType builder;
ASSERT_OK(builder.InsertMemoValues(*ex_dict));

// Inserting again should have no effect
Expand All @@ -314,7 +337,7 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
// Type mismatch
ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict));

std::vector<int64_t> raw_indices = {0, 1, 2, -1, 3};
std::vector<AppendCType> raw_indices = {0, 1, 2, -1, 3};
std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
for (int i = 0; i < 2; ++i) {
ASSERT_OK(builder.AppendIndices(
Expand All @@ -326,12 +349,19 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

auto ex_indices = ArrayFromJSON(int8(), R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
auto dtype = dictionary(int8(), utf8());
auto ex_indices = ArrayFromJSON(index_type, R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
auto dtype = dictionary(index_type, utf8());
DictionaryArray expected(dtype, ex_indices, ex_dict);
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestStringDictionaryBuilder, AppendIndices) {
// Currently AdaptiveIntBuilder only accepts int64_t in bulk appends
TestStringDictionaryAppendIndices<StringDictionaryBuilder, Int8Type, int64_t>();

TestStringDictionaryAppendIndices<StringDictionary32Builder, Int32Type, int32_t>();
}

TEST(TestStringDictionaryBuilder, ArrayInit) {
auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
Expand Down
168 changes: 117 additions & 51 deletions cpp/src/arrow/array/builder_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
#include <algorithm>
#include <memory>

#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
#include "arrow/array/builder_base.h" // IWYU pragma: export
#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
#include "arrow/array/builder_base.h" // IWYU pragma: export
#include "arrow/array/builder_primitive.h" // IWYU pragma: export

#include "arrow/array.h"

Expand Down Expand Up @@ -84,8 +85,6 @@ class ARROW_EXPORT DictionaryMemoTable {
std::unique_ptr<DictionaryMemoTableImpl> impl_;
};

} // namespace internal

/// \brief Array builder for created encoded DictionaryArray from
/// dense array
///
Expand All @@ -95,50 +94,50 @@ class ARROW_EXPORT DictionaryMemoTable {
/// build a delta dictionary when new terms occur.
///
/// data
template <typename T>
class DictionaryBuilder : public ArrayBuilder {
template <typename BuilderType, typename T>
class DictionaryBuilderBase : public ArrayBuilder {
public:
using Scalar = typename internal::DictionaryScalar<T>::type;
using Scalar = typename DictionaryScalar<T>::type;

// WARNING: the type given below is the value type, not the DictionaryType.
// The DictionaryType is instantiated on the Finish() call.
template <typename T1 = T>
DictionaryBuilder(
DictionaryBuilderBase(
typename std::enable_if<!std::is_base_of<FixedSizeBinaryType, T1>::value,
const std::shared_ptr<DataType>&>::type type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool),
memo_table_(new internal::DictionaryMemoTable(type)),
memo_table_(new DictionaryMemoTable(type)),
delta_offset_(0),
byte_width_(-1),
values_builder_(pool) {}

template <typename T1 = T>
explicit DictionaryBuilder(
explicit DictionaryBuilderBase(
typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
const std::shared_ptr<DataType>&>::type type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool),
memo_table_(new internal::DictionaryMemoTable(type)),
memo_table_(new DictionaryMemoTable(type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*type).byte_width()),
values_builder_(pool) {}

template <typename T1 = T>
explicit DictionaryBuilder(
explicit DictionaryBuilderBase(
typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool =
default_memory_pool())
: DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
: DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}

DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(dictionary->type(), pool),
memo_table_(new internal::DictionaryMemoTable(dictionary)),
memo_table_(new DictionaryMemoTable(dictionary)),
delta_offset_(0),
byte_width_(-1),
values_builder_(pool) {}

~DictionaryBuilder() override = default;
~DictionaryBuilderBase() override = default;

/// \brief Append a scalar value
Status Append(const Scalar& value) {
Expand Down Expand Up @@ -189,18 +188,6 @@ class DictionaryBuilder : public ArrayBuilder {
return memo_table_->InsertValues(values);
}

/// \brief Append dictionary indices directly without modifying memo
///
/// NOTE: Experimental API
Status AppendIndices(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
int64_t null_count_before = values_builder_.null_count();
ARROW_RETURN_NOT_OK(values_builder_.AppendValues(values, length, valid_bytes));
length_ += length;
null_count_ += values_builder_.null_count() - null_count_before;
return Status::OK();
}

/// \brief Append a whole dense array to the builder
template <typename T1 = T>
Status AppendArray(
Expand Down Expand Up @@ -242,7 +229,7 @@ class DictionaryBuilder : public ArrayBuilder {
void Reset() override {
ArrayBuilder::Reset();
values_builder_.Reset();
memo_table_.reset(new internal::DictionaryMemoTable(type_));
memo_table_.reset(new DictionaryMemoTable(type_));
delta_offset_ = 0;
}

Expand Down Expand Up @@ -291,26 +278,27 @@ class DictionaryBuilder : public ArrayBuilder {
bool is_building_delta() { return delta_offset_ > 0; }

protected:
std::unique_ptr<internal::DictionaryMemoTable> memo_table_;
std::unique_ptr<DictionaryMemoTable> memo_table_;

int32_t delta_offset_;
// Only used for FixedSizeBinaryType
int32_t byte_width_;

AdaptiveIntBuilder values_builder_;
BuilderType values_builder_;
};

template <>
class DictionaryBuilder<NullType> : public ArrayBuilder {
template <typename BuilderType>
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
public:
DictionaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool())
DictionaryBuilderBase(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool), values_builder_(pool) {}
explicit DictionaryBuilder(MemoryPool* pool = default_memory_pool())

explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(null(), pool), values_builder_(pool) {}

DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {}

/// \brief Append a scalar null value
Expand Down Expand Up @@ -362,16 +350,68 @@ class DictionaryBuilder<NullType> : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

protected:
AdaptiveIntBuilder values_builder_;
BuilderType values_builder_;
};

class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
} // namespace internal

/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
/// smallest index size that can accommodate the dictionary indices
template <typename T>
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
public:
using DictionaryBuilder::Append;
using DictionaryBuilder::AppendIndices;
using DictionaryBuilder::DictionaryBuilder;
using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
using BASE::BASE;

/// \brief Append dictionary indices directly without modifying memo
///
/// NOTE: Experimental API
Status AppendIndices(const int64_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
int64_t null_count_before = this->values_builder_.null_count();
ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
this->length_ += length;
this->null_count_ += this->values_builder_.null_count() - null_count_before;
return Status::OK();
}
};

BinaryDictionaryBuilder() : BinaryDictionaryBuilder(default_memory_pool()) {}
/// \brief A DictionaryArray builder that always returns int32 dictionary
/// indices so that data cast to dictionary form will have a consistent index
/// type, e.g. for creating a ChunkedArray
template <typename T>
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
public:
using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
using BASE::BASE;

/// \brief Append dictionary indices directly without modifying memo
///
/// NOTE: Experimental API
Status AppendIndices(const int32_t* values, int64_t length,
const uint8_t* valid_bytes = NULLPTR) {
int64_t null_count_before = this->values_builder_.null_count();
ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
this->length_ += length;
this->null_count_ += this->values_builder_.null_count() - null_count_before;
return Status::OK();
}
};

// ----------------------------------------------------------------------
// Binary / Unicode builders with slightly expanded APIs

namespace internal {

template <typename T>
class BinaryDictionaryBuilderImpl : public DictionaryBuilder<T> {
public:
using BASE = DictionaryBuilder<T>;
using BASE::Append;
using BASE::AppendIndices;
using BASE::BASE;

BinaryDictionaryBuilderImpl() : BinaryDictionaryBuilderImpl(default_memory_pool()) {}

Status Append(const uint8_t* value, int32_t length) {
return Append(reinterpret_cast<const char*>(value), length);
Expand All @@ -382,14 +422,16 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
}
};

/// \brief Dictionary array builder with convenience methods for strings
class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
template <typename T>
class BinaryDictionary32BuilderImpl : public Dictionary32Builder<T> {
public:
using DictionaryBuilder::Append;
using DictionaryBuilder::AppendIndices;
using DictionaryBuilder::DictionaryBuilder;
using BASE = Dictionary32Builder<T>;
using BASE::Append;
using BASE::AppendIndices;
using BASE::BASE;

StringDictionaryBuilder() : StringDictionaryBuilder(default_memory_pool()) {}
BinaryDictionary32BuilderImpl()
: BinaryDictionary32BuilderImpl(default_memory_pool()) {}

Status Append(const uint8_t* value, int32_t length) {
return Append(reinterpret_cast<const char*>(value), length);
Expand All @@ -400,4 +442,28 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType
}
};

} // namespace internal

class BinaryDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<BinaryType> {
using BASE = internal::BinaryDictionaryBuilderImpl<BinaryType>;
using BASE::BASE;
};

class StringDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<StringType> {
using BASE = BinaryDictionaryBuilderImpl<StringType>;
using BASE::BASE;
};

class BinaryDictionary32Builder
: public internal::BinaryDictionary32BuilderImpl<BinaryType> {
using BASE = internal::BinaryDictionary32BuilderImpl<BinaryType>;
using BASE::BASE;
};

class StringDictionary32Builder
: public internal::BinaryDictionary32BuilderImpl<StringType> {
using BASE = internal::BinaryDictionary32BuilderImpl<StringType>;
using BASE::BASE;
};

} // namespace arrow
16 changes: 9 additions & 7 deletions cpp/src/parquet/arrow/arrow-reader-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2736,20 +2736,22 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {

class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
public:
static constexpr int kNumRowGroups = 10;

void SetUp() override {
GenerateData(GetParam());

// Write 4 row groups; each row group will have a different dictionary
ASSERT_NO_FATAL_FAILURE(
WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 4,
WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / kNumRowGroups,
default_arrow_writer_properties(), &buffer_));

properties_ = default_arrow_reader_properties();
}

void GenerateData(double null_probability) {
constexpr int num_unique = 100;
constexpr int repeat = 100;
constexpr int num_unique = 1000;
constexpr int repeat = 50;
constexpr int64_t min_length = 2;
constexpr int64_t max_length = 100;
::arrow::random::RandomArrayGenerator rag(0);
Expand Down Expand Up @@ -2781,7 +2783,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
};

void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
::arrow::StringDictionaryBuilder builder(default_memory_pool());
::arrow::StringDictionary32Builder builder(default_memory_pool());
const auto& string_array = static_cast<const ::arrow::StringArray&>(arr);
ASSERT_OK(builder.AppendArray(string_array));
ASSERT_OK(builder.Finish(out));
Expand All @@ -2790,9 +2792,9 @@ void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
properties_.set_read_dictionary(0, true);

std::vector<std::shared_ptr<Array>> chunks(4);
const int64_t chunk_size = expected_dense_->num_rows() / 4;
for (int i = 0; i < 4; ++i) {
std::vector<std::shared_ptr<Array>> chunks(kNumRowGroups);
const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups;
for (int i = 0; i < kNumRowGroups; ++i) {
AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]);
}
auto ex_table = MakeSimpleTable(std::make_shared<ChunkedArray>(chunks),
Expand Down
Loading