From 779535280903bfe05fcbdfea5f5b01eb4e75c270 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 6 Jul 2019 20:19:45 -0700 Subject: [PATCH 01/11] change from array builder to buffer builde --- cpp/src/parquet/arrow/writer.cc | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 9c487b97e2e..c399dd91bd2 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -49,8 +49,7 @@ using arrow::ChunkedArray; using arrow::Decimal128Array; using arrow::Field; using arrow::FixedSizeBinaryArray; -using arrow::Int16Array; -using arrow::Int16Builder; +using Int16BufferBuilder = arrow::TypedBufferBuilder; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; @@ -82,7 +81,7 @@ namespace { class LevelBuilder { public: explicit LevelBuilder(MemoryPool* pool) - : def_levels_(::arrow::int16(), pool), rep_levels_(::arrow::int16(), pool) {} + : def_levels_(pool), rep_levels_(pool) {} Status VisitInline(const Array& array); @@ -179,15 +178,9 @@ class LevelBuilder { RETURN_NOT_OK(rep_levels_.Append(0)); RETURN_NOT_OK(HandleListEntries(0, 0, 0, array.length())); - std::shared_ptr def_levels_array; - std::shared_ptr rep_levels_array; - - RETURN_NOT_OK(def_levels_.Finish(&def_levels_array)); - RETURN_NOT_OK(rep_levels_.Finish(&rep_levels_array)); - - *def_levels_out = static_cast(def_levels_array.get())->values(); - *rep_levels_out = static_cast(rep_levels_array.get())->values(); - *num_levels = rep_levels_array->length(); + RETURN_NOT_OK(def_levels_.Finish(def_levels_out)); + RETURN_NOT_OK(rep_levels_.Finish(rep_levels_out)); + *num_levels = (*rep_levels_out)->size() / sizeof(int16_t); } return Status::OK(); @@ -261,8 +254,8 @@ class LevelBuilder { } private: - Int16Builder def_levels_; - Int16Builder rep_levels_; + Int16BufferBuilder def_levels_; + Int16BufferBuilder rep_levels_; std::vector null_counts_; std::vector valid_bitmaps_; @@ -307,7 +300,7 @@ struct ColumnWriterContext { Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) { if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) { if (type.num_children() != 1) { - return Status::Invalid("Nested column branch had multiple children"); + return Status::Invalid("Nested column branch had multiple children: ", type.ToString()); } return GetLeafType(*type.child(0)->type(), leaf_type); } else { From 4516110c569d033749bcb0488cb93a0e401844cd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 7 Jul 2019 22:37:21 -0700 Subject: [PATCH 02/11] Small style issues --- cpp/src/parquet/arrow/writer.cc | 60 +++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index c399dd91bd2..3a1d6b23fa0 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -80,8 +80,7 @@ namespace { class LevelBuilder { public: - explicit LevelBuilder(MemoryPool* pool) - : def_levels_(pool), rep_levels_(pool) {} + explicit LevelBuilder(MemoryPool* pool) : def_levels_(pool), rep_levels_(pool) {} Status VisitInline(const Array& array); @@ -101,6 +100,7 @@ class LevelBuilder { null_counts_.push_back(array.null_count()); offsets_.push_back(array.raw_value_offsets()); + // Min offset isn't always zero in the case of sliced Arrays. min_offset_idx_ = array.value_offset(min_offset_idx_); max_offset_idx_ = array.value_offset(max_offset_idx_); @@ -133,7 +133,6 @@ class LevelBuilder { *num_values = max_offset_idx_ - min_offset_idx_; *values_offset = min_offset_idx_; *values_array = values_array_; - // Walk downwards to extract nullability std::shared_ptr current_field = field; nullable_.push_back(current_field->nullable()); @@ -175,6 +174,9 @@ class LevelBuilder { } *num_levels = array.length(); } else { + def_levels_.Reserve(num_values); + rep_levels_.Reserve(num_values); + RETURN_NOT_OK(rep_levels_.Append(0)); RETURN_NOT_OK(HandleListEntries(0, 0, 0, array.length())); @@ -192,7 +194,7 @@ class LevelBuilder { BitUtil::GetBit(valid_bitmaps_[rep_level], index + array_offsets_[rep_level])) { return HandleNonNullList(static_cast(def_level + 1), rep_level, index); } else { - return def_levels_.Append(def_level); + return def_levels_.UnsafeAppend(def_level); } } else { return HandleNonNullList(def_level, rep_level, index); @@ -216,26 +218,31 @@ class LevelBuilder { const int64_t level_null_count = null_counts_[recursion_level]; const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level]; - for (int64_t i = 0; i < inner_length; i++) { - if (i > 0) { - RETURN_NOT_OK(rep_levels_.Append(static_cast(rep_level + 1))); - } - if (level_null_count && level_valid_bitmap == nullptr) { - // Special case: this is a null array (all elements are null) - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); - } else if (nullable_level && - ((level_null_count == 0) || - BitUtil::GetBit( - level_valid_bitmap, - inner_offset + i + array_offsets_[recursion_level]))) { - // Non-null element in a null level - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); - } else { - // This can be produced in two case: - // * elements are nullable and this one is null (i.e. max_def_level = def_level - // + 2) - // * elements are non-nullable (i.e. max_def_level = def_level + 1) - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); + if (inner_length >= 1) { + RETURN_NOT_OK( + rep_levels_.Append(inner_length - 1, static_cast(rep_level + 1))); + } + + // Special case: this is a null array (all elements are null) + if (level_null_count && level_valid_bitmap == nullptr) { + RETURN_NOT_OK( + def_levels_.Append(inner_length, static_cast(def_level + 1))); + } else { + for (int64_t i = 0; i < inner_length; i++) { + if (nullable_level && + ((level_null_count == 0) || + BitUtil::GetBit(level_valid_bitmap, + inner_offset + i + array_offsets_[recursion_level]))) { + // Non-null element in a null level + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); + } else { + // This can be produced in two case: + // * elements are nullable and this one is null (i.e. max_def_level = + // def_level + // + 2) + // * elements are non-nullable (i.e. max_def_level = def_level + 1) + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); + } } } return Status::OK(); @@ -246,7 +253,7 @@ class LevelBuilder { int64_t length) { for (int64_t i = 0; i < length; i++) { if (i > 0) { - RETURN_NOT_OK(rep_levels_.Append(rep_level)); + RETURN_NOT_OK(rep_levels_.UnsafeAppend(rep_level)); } RETURN_NOT_OK(HandleList(def_level, rep_level, offset + i)); } @@ -300,7 +307,8 @@ struct ColumnWriterContext { Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) { if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) { if (type.num_children() != 1) { - return Status::Invalid("Nested column branch had multiple children: ", type.ToString()); + return Status::Invalid("Nested column branch had multiple children: ", + type.ToString()); } return GetLeafType(*type.child(0)->type(), leaf_type); } else { From 325af5208238909bcf6e14ccc823be82a9068e67 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 8 Jul 2019 00:15:09 -0700 Subject: [PATCH 03/11] undo reserve --- cpp/src/arrow/buffer-builder.h | 6 ++---- cpp/src/parquet/arrow/writer.cc | 15 +++++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h index 85f36ee3f5a..797e50b78e7 100644 --- a/cpp/src/arrow/buffer-builder.h +++ b/cpp/src/arrow/buffer-builder.h @@ -220,10 +220,8 @@ class TypedBufferBuilder::value void UnsafeAppend(const int64_t num_copies, T value) { auto data = mutable_data() + length(); - bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); - for (const auto end = data + num_copies; data != end; ++data) { - *data = value; - } + bytes_builder_.UnsafeAdvance(num_copies * sizeof(T)); + std::fill(data, data + num_copies, value); } Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 3a1d6b23fa0..b27deb86f00 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -133,6 +133,7 @@ class LevelBuilder { *num_values = max_offset_idx_ - min_offset_idx_; *values_offset = min_offset_idx_; *values_array = values_array_; + // Walk downwards to extract nullability std::shared_ptr current_field = field; nullable_.push_back(current_field->nullable()); @@ -174,9 +175,11 @@ class LevelBuilder { } *num_levels = array.length(); } else { - def_levels_.Reserve(num_values); - rep_levels_.Reserve(num_values); - + // Note it is hard to estimate memory consumption due to zero length + // arrays otherwise we would preallocate. An upper boun on memory + // is the sum of the length of each list array + number of elements + // but this might be too loose of an upper bound so we choose to use + // safe methods. RETURN_NOT_OK(rep_levels_.Append(0)); RETURN_NOT_OK(HandleListEntries(0, 0, 0, array.length())); @@ -194,7 +197,7 @@ class LevelBuilder { BitUtil::GetBit(valid_bitmaps_[rep_level], index + array_offsets_[rep_level])) { return HandleNonNullList(static_cast(def_level + 1), rep_level, index); } else { - return def_levels_.UnsafeAppend(def_level); + return def_levels_.Append(def_level); } } else { return HandleNonNullList(def_level, rep_level, index); @@ -245,15 +248,15 @@ class LevelBuilder { } } } - return Status::OK(); } + return Status::OK(); } Status HandleListEntries(int16_t def_level, int16_t rep_level, int64_t offset, int64_t length) { for (int64_t i = 0; i < length; i++) { if (i > 0) { - RETURN_NOT_OK(rep_levels_.UnsafeAppend(rep_level)); + RETURN_NOT_OK(rep_levels_.Append(rep_level)); } RETURN_NOT_OK(HandleList(def_level, rep_level, offset + i)); } From 83ee5ae0c926a4ea1984a7b1d2d309dd4e1f26e4 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Tue, 9 Jul 2019 23:57:27 -0700 Subject: [PATCH 04/11] Update cpp/src/parquet/arrow/writer.cc Co-Authored-By: Benjamin Kietzman --- cpp/src/parquet/arrow/writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index b27deb86f00..b6ee850804e 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -49,7 +49,7 @@ using arrow::ChunkedArray; using arrow::Decimal128Array; using arrow::Field; using arrow::FixedSizeBinaryArray; -using Int16BufferBuilder = arrow::TypedBufferBuilder; +using UInt16BufferBuilder = arrow::TypedBufferBuilder; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; From 7cbc8e4d48f991b94b0eb20c722e9560538f190d Mon Sep 17 00:00:00 2001 From: emkornfield Date: Tue, 9 Jul 2019 23:57:36 -0700 Subject: [PATCH 05/11] Update cpp/src/parquet/arrow/writer.cc Co-Authored-By: Benjamin Kietzman --- cpp/src/parquet/arrow/writer.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index b6ee850804e..bfce31bf798 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -240,7 +240,8 @@ class LevelBuilder { RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); } else { // This can be produced in two case: - // * elements are nullable and this one is null (i.e. max_def_level = + // * elements are nullable and this one is null + // (i.e. max_def_level = def_level + 2) // def_level // + 2) // * elements are non-nullable (i.e. max_def_level = def_level + 1) From 6776e6109dbeadcf99735c14fb87c51310c45a93 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Tue, 9 Jul 2019 23:57:48 -0700 Subject: [PATCH 06/11] Update cpp/src/parquet/arrow/writer.cc Co-Authored-By: Benjamin Kietzman --- cpp/src/parquet/arrow/writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index bfce31bf798..d873fc4fbbe 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -311,7 +311,7 @@ struct ColumnWriterContext { Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) { if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) { if (type.num_children() != 1) { - return Status::Invalid("Nested column branch had multiple children: ", + return Status::Invalid("Nested column branch had multiple children: ", *type); type.ToString()); } return GetLeafType(*type.child(0)->type(), leaf_type); From ef7fade899b344b047a7808309b603f4101c7204 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 10 Jul 2019 00:04:36 -0700 Subject: [PATCH 07/11] remove unneeded nesting --- cpp/src/parquet/arrow/writer.cc | 55 +++++++++++++++------------------ 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index d873fc4fbbe..5af7377e6bc 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -215,39 +215,34 @@ class LevelBuilder { return HandleListEntries(static_cast(def_level + 1), static_cast(rep_level + 1), inner_offset, inner_length); - } else { - // We have reached the leaf: primitive list, handle remaining nullables - const bool nullable_level = nullable_[recursion_level]; - const int64_t level_null_count = null_counts_[recursion_level]; - const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level]; + } + // We have reached the leaf: primitive list, handle remaining nullables + const bool nullable_level = nullable_[recursion_level]; + const int64_t level_null_count = null_counts_[recursion_level]; + const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level]; - if (inner_length >= 1) { - RETURN_NOT_OK( - rep_levels_.Append(inner_length - 1, static_cast(rep_level + 1))); - } + if (inner_length >= 1) { + RETURN_NOT_OK( + rep_levels_.Append(inner_length - 1, static_cast(rep_level + 1))); + } - // Special case: this is a null array (all elements are null) - if (level_null_count && level_valid_bitmap == nullptr) { - RETURN_NOT_OK( - def_levels_.Append(inner_length, static_cast(def_level + 1))); + // Special case: this is a null array (all elements are null) + if (level_null_count && level_valid_bitmap == nullptr) { + return def_levels_.Append(inner_length, static_cast(def_level + 1))); + } + for (int64_t i = 0; i < inner_length; i++) { + if (nullable_level && + ((level_null_count == 0) || + BitUtil::GetBit(level_valid_bitmap, + inner_offset + i + array_offsets_[recursion_level]))) { + // Non-null element in a null level + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); } else { - for (int64_t i = 0; i < inner_length; i++) { - if (nullable_level && - ((level_null_count == 0) || - BitUtil::GetBit(level_valid_bitmap, - inner_offset + i + array_offsets_[recursion_level]))) { - // Non-null element in a null level - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); - } else { - // This can be produced in two case: - // * elements are nullable and this one is null - // (i.e. max_def_level = def_level + 2) - // def_level - // + 2) - // * elements are non-nullable (i.e. max_def_level = def_level + 1) - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); - } - } + // This can be produced in two cases: + // * elements are nullable and this one is null + // (i.e. max_def_level = def_level + 2) + // * elements are non-nullable (i.e. max_def_level = def_level + 1) + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); } } return Status::OK(); From 67ccf3ec3fb12224d361bf4970ccab1932ea7618 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 10 Jul 2019 00:08:13 -0700 Subject: [PATCH 08/11] update references --- cpp/src/parquet/arrow/writer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 5af7377e6bc..4808764c3bb 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -260,8 +260,8 @@ class LevelBuilder { } private: - Int16BufferBuilder def_levels_; - Int16BufferBuilder rep_levels_; + UInt16BufferBuilder def_levels_; + UInt16BufferBuilder rep_levels_; std::vector null_counts_; std::vector valid_bitmaps_; From 152a6ee802773904289f5ea2c397a359dd815532 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 10 Jul 2019 00:10:36 -0700 Subject: [PATCH 09/11] change back to signed it seems to be what the API uses --- cpp/src/parquet/arrow/writer.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 4808764c3bb..2f0a48c7511 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -49,7 +49,7 @@ using arrow::ChunkedArray; using arrow::Decimal128Array; using arrow::Field; using arrow::FixedSizeBinaryArray; -using UInt16BufferBuilder = arrow::TypedBufferBuilder; +using Int16BufferBuilder = arrow::TypedBufferBuilder; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; @@ -260,8 +260,8 @@ class LevelBuilder { } private: - UInt16BufferBuilder def_levels_; - UInt16BufferBuilder rep_levels_; + Int16BufferBuilder def_levels_; + Int16BufferBuilder rep_levels_; std::vector null_counts_; std::vector valid_bitmaps_; From e1698e55122bf3758e2f8339760cb3c03c1bfb2c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 10 Jul 2019 00:12:44 -0700 Subject: [PATCH 10/11] fix compilation issues --- cpp/src/parquet/arrow/writer.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 2f0a48c7511..fd40319fde8 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -228,7 +228,7 @@ class LevelBuilder { // Special case: this is a null array (all elements are null) if (level_null_count && level_valid_bitmap == nullptr) { - return def_levels_.Append(inner_length, static_cast(def_level + 1))); + return def_levels_.Append(inner_length, static_cast(def_level + 1)); } for (int64_t i = 0; i < inner_length; i++) { if (nullable_level && @@ -239,7 +239,7 @@ class LevelBuilder { RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); } else { // This can be produced in two cases: - // * elements are nullable and this one is null + // * elements are nullable and this one is null // (i.e. max_def_level = def_level + 2) // * elements are non-nullable (i.e. max_def_level = def_level + 1) RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); @@ -306,8 +306,7 @@ struct ColumnWriterContext { Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) { if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) { if (type.num_children() != 1) { - return Status::Invalid("Nested column branch had multiple children: ", *type); - type.ToString()); + return Status::Invalid("Nested column branch had multiple children: ", type); } return GetLeafType(*type.child(0)->type(), leaf_type); } else { From 2f773139847602542dd4441275d956f64ce00a41 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 10 Jul 2019 14:59:50 -0500 Subject: [PATCH 11/11] Export operator<< symbols for DataType, TimeUnit on Windows --- cpp/src/arrow/type.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 10c1f90b58f..fc235bb2d67 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -215,6 +215,7 @@ class ARROW_EXPORT DataType { ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); }; +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const DataType& type); /// \brief Base class for all fixed-width data types @@ -762,6 +763,7 @@ struct TimeUnit { enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; }; +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, TimeUnit::type unit); /// Base type class for time data