From 9f37917667af5a2727c00e78373b624bb8c96da6 Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Thu, 11 Apr 2019 11:50:11 +0900 Subject: [PATCH 1/2] [C++] Enable dictionary builder tests with MinGW build This stops exporting template class that has internal implementation. It's not supported in MinGW. This exposes all template class implementations to .h and hides internal details to .cc by pimpl idiom. See also ARROW-4399. --- ci/appveyor-cpp-setup-mingw.bat | 1 + cpp/src/arrow/CMakeLists.txt | 28 +-- cpp/src/arrow/array/builder_dict.cc | 316 ++++++++++++---------------- cpp/src/arrow/array/builder_dict.h | 225 ++++++++++++++++++-- cpp/src/arrow/util/hashing.h | 22 +- 5 files changed, 359 insertions(+), 233 deletions(-) diff --git a/ci/appveyor-cpp-setup-mingw.bat b/ci/appveyor-cpp-setup-mingw.bat index d65b9bd4d20..4e4ae757985 100644 --- a/ci/appveyor-cpp-setup-mingw.bat +++ b/ci/appveyor-cpp-setup-mingw.bat @@ -32,6 +32,7 @@ pacman --sync --refresh --noconfirm ^ "%MINGW_PACKAGE_PREFIX%-flatbuffers" ^ "%MINGW_PACKAGE_PREFIX%-gflags" ^ "%MINGW_PACKAGE_PREFIX%-gobject-introspection" ^ + "%MINGW_PACKAGE_PREFIX%-gtest" ^ "%MINGW_PACKAGE_PREFIX%-gtk-doc" ^ "%MINGW_PACKAGE_PREFIX%-lz4" ^ "%MINGW_PACKAGE_PREFIX%-meson" ^ diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 3a6605a035e..8b81af3c4a5 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -320,26 +320,14 @@ arrow_add_pkg_config("arrow") # add_arrow_test(allocator-test) - -if(WIN32) - add_arrow_test(array-test - SOURCES - array-test.cc - array-binary-test.cc - array-list-test.cc - array-struct-test.cc - array-union-test.cc) -else() - add_arrow_test(array-test - SOURCES - array-test.cc - array-binary-test.cc - array-dict-test.cc - array-list-test.cc - array-struct-test.cc - array-union-test.cc) -endif() - +add_arrow_test(array-test + SOURCES + array-test.cc + array-binary-test.cc + array-dict-test.cc + array-list-test.cc + array-struct-test.cc + array-union-test.cc) add_arrow_test(buffer-test) if(ARROW_IPC) diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 2e43234ae11..8a243fb9fe7 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -17,7 +17,6 @@ #include "arrow/array/builder_dict.h" -#include #include #include #include @@ -145,229 +144,176 @@ Status DictionaryType::Unify(MemoryPool* pool, const std::vector -class DictionaryBuilder::MemoTableImpl - : public internal::HashTraits::MemoTableType { - public: - using MemoTableType = typename internal::HashTraits::MemoTableType; - using MemoTableType::MemoTableType; - - MemoTableImpl(const std::shared_ptr& dictionary) - : MemoTableImpl(dictionary->length()) { - const auto& values = - static_cast::ArrayType&>(*dictionary); - for (int64_t i = 0; i < values.length(); ++i) { - ARROW_IGNORE_EXPR(this->GetOrInsert(values.GetView(i))); +class internal::DictionaryMemoTable::DictionaryMemoTableImpl { + struct MemoTableInitializer { + std::shared_ptr value_type_; + std::unique_ptr* memo_table_; + + Status Visit(const DataType&, void* = nullptr) { + return Status::NotImplemented("Initialization of ", value_type_, + " memo table is not implemented"); } - } -}; -template -DictionaryBuilder::~DictionaryBuilder() {} - -template -DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& dictionary, - MemoryPool* pool) - : ArrayBuilder(dictionary->type(), pool), - memo_table_(new MemoTableImpl(dictionary)), - delta_offset_(0), - byte_width_(-1), - values_builder_(pool) { - DCHECK_EQ(T::type_id, type_->id()) << "inconsistent type passed to DictionaryBuilder"; -} + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using MemoTable = typename internal::DictionaryTraits::MemoTableType; + memo_table_->reset(new MemoTable(0)); + return Status::OK(); + } + }; -template -DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, - MemoryPool* pool) - : ArrayBuilder(type, pool), - memo_table_(new MemoTableImpl(0)), - delta_offset_(0), - byte_width_(-1), - values_builder_(pool) { - DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder"; -} + struct ArrayValuesInserter { + DictionaryMemoTableImpl* impl_; -DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, - MemoryPool* pool) - : ArrayBuilder(type, pool), values_builder_(pool) { - DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder"; -} + template + Status Visit(const T& array) { + return InsertValues(array.type(), array); + } -DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& dictionary, - MemoryPool* pool) - : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) { - DCHECK_EQ(Type::NA, type_->id()) << "inconsistent type passed to DictionaryBuilder"; -} + private: + template + Status InsertValues(const DataType& type, const Array&, void* = nullptr) { + return Status::NotImplemented("Inserting array values of ", type, + " is not implemented"); + } -template <> -DictionaryBuilder::DictionaryBuilder( - const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), - memo_table_(new MemoTableImpl(0)), - delta_offset_(0), - byte_width_(checked_cast(*type).byte_width()) {} - -template -void DictionaryBuilder::Reset() { - ArrayBuilder::Reset(); - values_builder_.Reset(); - memo_table_.reset(new MemoTableImpl(0)); - delta_offset_ = 0; -} + template + Status InsertValues( + const DataType&, const Array& array, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + for (int64_t i = 0; i < array.length(); ++i) { + ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); + } + return Status::OK(); + } + }; + + struct ArrayDataGetter { + std::shared_ptr value_type_; + MemoTable* memo_table_; + MemoryPool* pool_; + int64_t start_offset_; + std::shared_ptr* out_; + + Status Visit(const DataType&, void* = nullptr) { + return Status::NotImplemented("Getting array data of ", value_type_, + " is not implemented"); + } -template -Status DictionaryBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); + template + Status Visit(const T&, + typename internal::DictionaryTraits::MemoTableType* = nullptr) { + using ConcreteMemoTable = typename internal::DictionaryTraits::MemoTableType; + auto memo_table = static_cast(memo_table_); + return internal::DictionaryTraits::GetDictionaryArrayData( + pool_, value_type_, *memo_table, start_offset_, out_); + } + }; - if (capacity_ == 0) { - // Initialize hash table - // XXX should we let the user pass additional size heuristics? - delta_offset_ = 0; + public: + explicit DictionaryMemoTableImpl(const std::shared_ptr& type) + : type_(type), memo_table_(nullptr) { + MemoTableInitializer visitor{type_, &memo_table_}; + ARROW_IGNORE_EXPR(VisitTypeInline(*type_, &visitor)); } - RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); - return Status::OK(); -} - -Status DictionaryBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); - return Status::OK(); -} + Status InsertValues(const std::shared_ptr& array) { + ArrayValuesInserter visitor{this}; + return VisitArrayInline(*array, &visitor); + } -template -Status DictionaryBuilder::Append(const Scalar& value) { - RETURN_NOT_OK(Reserve(1)); + template + int32_t GetOrInsert(const typename TypeTraits::CType& value) { + using ConcreteMemoTable = typename internal::HashTraits::MemoTableType; + return static_cast(memo_table_.get())->GetOrInsert(value); + } - auto memo_index = memo_table_->GetOrInsert(value); - RETURN_NOT_OK(values_builder_.Append(memo_index)); - length_ += 1; + template + int32_t GetOrInsert( + typename std::enable_if::value, + const util::string_view&>::type value) { + using ConcreteMemoTable = typename internal::HashTraits::MemoTableType; + return static_cast(memo_table_.get())->GetOrInsert(value); + } - return Status::OK(); -} + Status GetArrayData(MemoryPool* pool, int64_t start_offset, + std::shared_ptr* out) { + ArrayDataGetter visitor{type_, memo_table_.get(), pool, start_offset, out}; + return VisitTypeInline(*type_, &visitor); + } -template -Status DictionaryBuilder::AppendNull() { - length_ += 1; - null_count_ += 1; + int32_t size() const { return memo_table_->size(); } - return values_builder_.AppendNull(); -} + private: + std::shared_ptr type_; + std::unique_ptr memo_table_; +}; -template -Status DictionaryBuilder::AppendNulls(int64_t length) { - length_ += length; - null_count_ += length; +internal::DictionaryMemoTable::DictionaryMemoTable(const std::shared_ptr& type) + : impl_(new DictionaryMemoTableImpl(type)) {} - return values_builder_.AppendNulls(length); +internal::DictionaryMemoTable::DictionaryMemoTable( + const std::shared_ptr& dictionary) + : impl_(new DictionaryMemoTableImpl(dictionary->type())) { + ARROW_IGNORE_EXPR(impl_->InsertValues(dictionary)); } -Status DictionaryBuilder::AppendNull() { - length_ += 1; - null_count_ += 1; +internal::DictionaryMemoTable::~DictionaryMemoTable() = default; - return values_builder_.AppendNull(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const bool& value) { + return impl_->GetOrInsert(value); } -Status DictionaryBuilder::AppendNulls(int64_t length) { - length_ += length; - null_count_ += length; - - return values_builder_.AppendNulls(length); +int32_t internal::DictionaryMemoTable::GetOrInsert(const int8_t& value) { + return impl_->GetOrInsert(value); } -template -Status DictionaryBuilder::AppendArray(const Array& array) { - using ArrayType = typename TypeTraits::ArrayType; - - const auto& concrete_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(concrete_array.GetView(i))); - } - } - return Status::OK(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const int16_t& value) { + return impl_->GetOrInsert(value); } -template <> -Status DictionaryBuilder::AppendArray(const Array& array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& typed_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(typed_array.GetValue(i))); - } - } - return Status::OK(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const int32_t& value) { + return impl_->GetOrInsert(value); } -Status DictionaryBuilder::AppendArray(const Array& array) { - for (int64_t i = 0; i < array.length(); i++) { - RETURN_NOT_OK(AppendNull()); - } - return Status::OK(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const int64_t& value) { + return impl_->GetOrInsert(value); } -template -Status DictionaryBuilder::FinishInternal(std::shared_ptr* out) { - // Finalize indices array - RETURN_NOT_OK(values_builder_.FinishInternal(out)); - - // Generate dictionary array from hash table contents - std::shared_ptr dictionary; - std::shared_ptr dictionary_data; +int32_t internal::DictionaryMemoTable::GetOrInsert(const uint8_t& value) { + return impl_->GetOrInsert(value); +} - RETURN_NOT_OK(internal::DictionaryTraits::GetDictionaryArrayData( - pool_, type_, *memo_table_, delta_offset_, &dictionary_data)); - dictionary = MakeArray(dictionary_data); +int32_t internal::DictionaryMemoTable::GetOrInsert(const uint16_t& value) { + return impl_->GetOrInsert(value); +} - // Set type of array data to the right dictionary type - (*out)->type = std::make_shared((*out)->type, dictionary); +int32_t internal::DictionaryMemoTable::GetOrInsert(const uint32_t& value) { + return impl_->GetOrInsert(value); +} - // Update internals for further uses of this DictionaryBuilder - delta_offset_ = memo_table_->size(); - values_builder_.Reset(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const uint64_t& value) { + return impl_->GetOrInsert(value); +} - return Status::OK(); +int32_t internal::DictionaryMemoTable::GetOrInsert(const float& value) { + return impl_->GetOrInsert(value); } -Status DictionaryBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr dictionary = std::make_shared(0); +int32_t internal::DictionaryMemoTable::GetOrInsert(const double& value) { + return impl_->GetOrInsert(value); +} - RETURN_NOT_OK(values_builder_.FinishInternal(out)); - (*out)->type = std::make_shared((*out)->type, dictionary); +int32_t internal::DictionaryMemoTable::GetOrInsert(const util::string_view& value) { + return impl_->GetOrInsert(value); +} - return Status::OK(); +Status internal::DictionaryMemoTable::GetArrayData(MemoryPool* pool, int64_t start_offset, + std::shared_ptr* out) { + return impl_->GetArrayData(pool, start_offset, out); } -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; -template class DictionaryBuilder; +int32_t internal::DictionaryMemoTable::size() const { return impl_->size(); } } // namespace arrow diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index e7f44b9d09f..84f2e87c359 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -17,11 +17,14 @@ #pragma once +#include #include #include "arrow/array/builder_adaptive.h" // IWYU pragma: export #include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array.h" + namespace arrow { // ---------------------------------------------------------------------- @@ -49,6 +52,35 @@ struct DictionaryScalar { using type = util::string_view; }; +class ARROW_EXPORT DictionaryMemoTable { + public: + explicit DictionaryMemoTable(const std::shared_ptr& type); + explicit DictionaryMemoTable(const std::shared_ptr& dictionary); + ~DictionaryMemoTable(); + + int32_t GetOrInsert(const bool& value); + int32_t GetOrInsert(const int8_t& value); + int32_t GetOrInsert(const int16_t& value); + int32_t GetOrInsert(const int32_t& value); + int32_t GetOrInsert(const int64_t& value); + int32_t GetOrInsert(const uint8_t& value); + int32_t GetOrInsert(const uint16_t& value); + int32_t GetOrInsert(const uint32_t& value); + int32_t GetOrInsert(const uint64_t& value); + int32_t GetOrInsert(const float& value); + int32_t GetOrInsert(const double& value); + int32_t GetOrInsert(const util::string_view& value); + + Status GetArrayData(MemoryPool* pool, int64_t start_offset, + std::shared_ptr* out); + + int32_t size() const; + + private: + class DictionaryMemoTableImpl; + std::unique_ptr impl_; +}; + } // namespace internal /// \brief Array builder for created encoded DictionaryArray from dense array @@ -60,25 +92,58 @@ struct DictionaryScalar { /// /// data template -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { +class DictionaryBuilder : public ArrayBuilder { public: using Scalar = typename internal::DictionaryScalar::type; // WARNING: the type given below is the value type, not the DictionaryType. // The DictionaryType is instantiated on the Finish() call. - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + template + DictionaryBuilder( + typename std::enable_if::value, + const std::shared_ptr&>::type type, + MemoryPool* pool) + : ArrayBuilder(type, pool), + memo_table_(new internal::DictionaryMemoTable(type)), + delta_offset_(0), + byte_width_(-1), + values_builder_(pool) {} - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool); + template + explicit DictionaryBuilder( + typename std::enable_if::value, + const std::shared_ptr&>::type type, + MemoryPool* pool) + : ArrayBuilder(type, pool), + memo_table_(new internal::DictionaryMemoTable(type)), + delta_offset_(0), + byte_width_(static_cast(*type).byte_width()), + values_builder_(pool) {} template explicit DictionaryBuilder( typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - ~DictionaryBuilder() override; + DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), + memo_table_(new internal::DictionaryMemoTable(dictionary)), + delta_offset_(0), + byte_width_(-1), + values_builder_(pool) {} + + ~DictionaryBuilder() override = default; /// \brief Append a scalar value - Status Append(const Scalar& value); + Status Append(const Scalar& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + + auto memo_index = memo_table_->GetOrInsert(value); + ARROW_RETURN_NOT_OK(values_builder_.Append(memo_index)); + length_ += 1; + + return Status::OK(); + } /// \brief Append a fixed-width string (only for FixedSizeBinaryType) template @@ -95,16 +160,100 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { } /// \brief Append a scalar null value - Status AppendNull() final; + Status AppendNull() final { + length_ += 1; + null_count_ += 1; - Status AppendNulls(int64_t length) final; + return values_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; + + return values_builder_.AppendNulls(length); + } /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); + template + Status AppendArray( + typename std::enable_if::value, + const Array&>::type array) { + using ArrayType = typename TypeTraits::ArrayType; + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i))); + } + } + return Status::OK(); + } - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; + template + Status AppendArray( + typename std::enable_if::value, + const Array&>::type array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid( + "Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& concrete_array = static_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + ARROW_RETURN_NOT_OK(AppendNull()); + } else { + ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i))); + } + } + return Status::OK(); + } + + void Reset() override { + ArrayBuilder::Reset(); + values_builder_.Reset(); + memo_table_.reset(new internal::DictionaryMemoTable(type_)); + delta_offset_ = 0; + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + if (capacity_ == 0) { + // Initialize hash table + // XXX should we let the user pass additional size heuristics? + delta_offset_ = 0; + } + ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); + capacity_ = values_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Finalize indices array + ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); + + // Generate dictionary array from hash table contents + std::shared_ptr dictionary; + std::shared_ptr dictionary_data; + + ARROW_RETURN_NOT_OK( + memo_table_->GetArrayData(pool_, delta_offset_, &dictionary_data)); + dictionary = MakeArray(dictionary_data); + + // Set type of array data to the right dictionary type + (*out)->type = std::make_shared((*out)->type, dictionary); + + // Update internals for further uses of this DictionaryBuilder + delta_offset_ = memo_table_->size(); + values_builder_.Reset(); + + return Status::OK(); + } /// \cond FALSE using ArrayBuilder::Finish; @@ -116,8 +265,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { bool is_building_delta() { return delta_offset_ > 0; } protected: - class MemoTableImpl; - std::unique_ptr memo_table_; + std::unique_ptr memo_table_; int32_t delta_offset_; // Only used for FixedSizeBinaryType @@ -127,23 +275,56 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { }; template <> -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { +class DictionaryBuilder : public ArrayBuilder { public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - explicit DictionaryBuilder(MemoryPool* pool); + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), values_builder_(pool) {} + explicit DictionaryBuilder(MemoryPool* pool) + : ArrayBuilder(null(), pool), values_builder_(pool) {} - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool); + DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {} /// \brief Append a scalar null value - Status AppendNull() final; + Status AppendNull() final { + length_ += 1; + null_count_ += 1; + + return values_builder_.AppendNull(); + } + + Status AppendNulls(int64_t length) final { + length_ += length; + null_count_ += length; - Status AppendNulls(int64_t length) final; + return values_builder_.AppendNulls(length); + } /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); + Status AppendArray(const Array& array) { + for (int64_t i = 0; i < array.length(); i++) { + ARROW_RETURN_NOT_OK(AppendNull()); + } + return Status::OK(); + } + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; + ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); + capacity_ = values_builder_.capacity(); + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override { + std::shared_ptr dictionary = std::make_shared(0); + + ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); + (*out)->type = std::make_shared((*out)->type, dictionary); + + return Status::OK(); + } /// \cond FALSE using ArrayBuilder::Finish; diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 044d4e96624..27301585fc6 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -332,6 +332,16 @@ class HashTable { // XXX typedef memo_index_t int32_t ? +// ---------------------------------------------------------------------- +// A base class for memoization table. + +class MemoTable { + public: + virtual ~MemoTable() = default; + + virtual int32_t size() const = 0; +}; + // ---------------------------------------------------------------------- // A memoization table for memory-cheap scalar values. @@ -339,7 +349,7 @@ class HashTable { // index for each key. template class HashTableTemplateType = HashTable> -class ScalarMemoTable { +class ScalarMemoTable : public MemoTable { public: explicit ScalarMemoTable(int64_t entries = 0) : hash_table_(static_cast(entries)) {} @@ -382,7 +392,7 @@ class ScalarMemoTable { // The number of entries in the memo table // (which is also 1 + the largest memo index) - int32_t size() const { return static_cast(hash_table_.size()); } + int32_t size() const override { return static_cast(hash_table_.size()); } // Copy values starting from index `start` into `out_data` void CopyValues(int32_t start, Scalar* out_data) const { @@ -435,7 +445,7 @@ struct SmallScalarTraits class HashTableTemplateType = HashTable> -class SmallScalarMemoTable { +class SmallScalarMemoTable : public MemoTable { public: explicit SmallScalarMemoTable(int64_t entries = 0) { std::fill(value_to_index_, value_to_index_ + cardinality, -1); @@ -469,7 +479,7 @@ class SmallScalarMemoTable { // The number of entries in the memo table // (which is also 1 + the largest memo index) - int32_t size() const { return static_cast(index_to_value_.size()); } + int32_t size() const override { return static_cast(index_to_value_.size()); } // Copy values starting from index `start` into `out_data` void CopyValues(int32_t start, Scalar* out_data) const { @@ -498,7 +508,7 @@ class SmallScalarMemoTable { // ---------------------------------------------------------------------- // A memoization table for variable-sized binary data. -class BinaryMemoTable { +class BinaryMemoTable : public MemoTable { public: explicit BinaryMemoTable(int64_t entries = 0, int64_t values_size = -1) : hash_table_(static_cast(entries)) { @@ -576,7 +586,7 @@ class BinaryMemoTable { // The number of entries in the memo table // (which is also 1 + the largest memo index) - int32_t size() const { return static_cast(hash_table_.size()); } + int32_t size() const override { return static_cast(hash_table_.size()); } int32_t values_size() const { return static_cast(values_.size()); } From e4d7f7d0a6249036cbdd2ba0995edce2b9dec87a Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Tue, 7 May 2019 05:04:26 +0900 Subject: [PATCH 2/2] Simplify --- cpp/src/arrow/array/builder_dict.cc | 37 +++++++++++++---------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 8a243fb9fe7..8af36280d0d 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -224,17 +224,14 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { } template - int32_t GetOrInsert(const typename TypeTraits::CType& value) { - using ConcreteMemoTable = typename internal::HashTraits::MemoTableType; + int32_t GetOrInsert(const T& value) { + using ConcreteMemoTable = typename internal::DictionaryTraits< + typename CTypeTraits::ArrowType>::MemoTableType; return static_cast(memo_table_.get())->GetOrInsert(value); } - template - int32_t GetOrInsert( - typename std::enable_if::value, - const util::string_view&>::type value) { - using ConcreteMemoTable = typename internal::HashTraits::MemoTableType; - return static_cast(memo_table_.get())->GetOrInsert(value); + int32_t GetOrInsert(const util::string_view& value) { + return static_cast(memo_table_.get())->GetOrInsert(value); } Status GetArrayData(MemoryPool* pool, int64_t start_offset, @@ -262,51 +259,51 @@ internal::DictionaryMemoTable::DictionaryMemoTable( internal::DictionaryMemoTable::~DictionaryMemoTable() = default; int32_t internal::DictionaryMemoTable::GetOrInsert(const bool& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const int8_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const int16_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const int32_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const int64_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const uint8_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const uint16_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const uint32_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const uint64_t& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const float& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const double& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } int32_t internal::DictionaryMemoTable::GetOrInsert(const util::string_view& value) { - return impl_->GetOrInsert(value); + return impl_->GetOrInsert(value); } Status internal::DictionaryMemoTable::GetArrayData(MemoryPool* pool, int64_t start_offset,