diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h index 85f36ee3f5a..797e50b78e7 100644 --- a/cpp/src/arrow/buffer-builder.h +++ b/cpp/src/arrow/buffer-builder.h @@ -220,10 +220,8 @@ class TypedBufferBuilder::value void UnsafeAppend(const int64_t num_copies, T value) { auto data = mutable_data() + length(); - bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0); - for (const auto end = data + num_copies; data != end; ++data) { - *data = value; - } + bytes_builder_.UnsafeAdvance(num_copies * sizeof(T)); + std::fill(data, data + num_copies, value); } Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 10c1f90b58f..fc235bb2d67 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -215,6 +215,7 @@ class ARROW_EXPORT DataType { ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); }; +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const DataType& type); /// \brief Base class for all fixed-width data types @@ -762,6 +763,7 @@ struct TimeUnit { enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; }; +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, TimeUnit::type unit); /// Base type class for time data diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 9c487b97e2e..fd40319fde8 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -49,8 +49,7 @@ using arrow::ChunkedArray; using arrow::Decimal128Array; using arrow::Field; using arrow::FixedSizeBinaryArray; -using arrow::Int16Array; -using arrow::Int16Builder; +using Int16BufferBuilder = arrow::TypedBufferBuilder; using arrow::ListArray; using arrow::MemoryPool; using arrow::NumericArray; @@ -81,8 +80,7 @@ namespace { class LevelBuilder { public: - explicit LevelBuilder(MemoryPool* pool) - : def_levels_(::arrow::int16(), pool), rep_levels_(::arrow::int16(), pool) {} + explicit LevelBuilder(MemoryPool* pool) : def_levels_(pool), 
rep_levels_(pool) {} Status VisitInline(const Array& array); @@ -102,6 +100,7 @@ class LevelBuilder { null_counts_.push_back(array.null_count()); offsets_.push_back(array.raw_value_offsets()); + // Min offset isn't always zero in the case of sliced Arrays. min_offset_idx_ = array.value_offset(min_offset_idx_); max_offset_idx_ = array.value_offset(max_offset_idx_); @@ -176,18 +175,17 @@ class LevelBuilder { } *num_levels = array.length(); } else { + // Note it is hard to estimate memory consumption due to zero length + // arrays otherwise we would preallocate. An upper bound on memory + // is the sum of the length of each list array + number of elements + // but this might be too loose of an upper bound so we choose to use + // safe methods. RETURN_NOT_OK(rep_levels_.Append(0)); RETURN_NOT_OK(HandleListEntries(0, 0, 0, array.length())); - std::shared_ptr def_levels_array; - std::shared_ptr rep_levels_array; - - RETURN_NOT_OK(def_levels_.Finish(&def_levels_array)); - RETURN_NOT_OK(rep_levels_.Finish(&rep_levels_array)); - - *def_levels_out = static_cast(def_levels_array.get())->values(); - *rep_levels_out = static_cast(rep_levels_array.get())->values(); - *num_levels = rep_levels_array->length(); + RETURN_NOT_OK(def_levels_.Finish(def_levels_out)); + RETURN_NOT_OK(rep_levels_.Finish(rep_levels_out)); + *num_levels = (*rep_levels_out)->size() / sizeof(int16_t); } return Status::OK(); @@ -217,36 +215,37 @@ class LevelBuilder { return HandleListEntries(static_cast(def_level + 1), static_cast(rep_level + 1), inner_offset, inner_length); - } else { - // We have reached the leaf: primitive list, handle remaining nullables - const bool nullable_level = nullable_[recursion_level]; - const int64_t level_null_count = null_counts_[recursion_level]; - const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level]; - - for (int64_t i = 0; i < inner_length; i++) { - if (i > 0) { - RETURN_NOT_OK(rep_levels_.Append(static_cast(rep_level + 1))); - } - if (level_null_count && 
level_valid_bitmap == nullptr) { - // Special case: this is a null array (all elements are null) - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); - } else if (nullable_level && - ((level_null_count == 0) || - BitUtil::GetBit( - level_valid_bitmap, - inner_offset + i + array_offsets_[recursion_level]))) { - // Non-null element in a null level - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); - } else { - // This can be produced in two case: - // * elements are nullable and this one is null (i.e. max_def_level = def_level - // + 2) - // * elements are non-nullable (i.e. max_def_level = def_level + 1) - RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); - } + } + // We have reached the leaf: primitive list, handle remaining nullables + const bool nullable_level = nullable_[recursion_level]; + const int64_t level_null_count = null_counts_[recursion_level]; + const uint8_t* level_valid_bitmap = valid_bitmaps_[recursion_level]; + + if (inner_length >= 1) { + RETURN_NOT_OK( + rep_levels_.Append(inner_length - 1, static_cast(rep_level + 1))); + } + + // Special case: this is a null array (all elements are null) + if (level_null_count && level_valid_bitmap == nullptr) { + return def_levels_.Append(inner_length, static_cast(def_level + 1)); + } + for (int64_t i = 0; i < inner_length; i++) { + if (nullable_level && + ((level_null_count == 0) || + BitUtil::GetBit(level_valid_bitmap, + inner_offset + i + array_offsets_[recursion_level]))) { + // Non-null element in a null level + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 2))); + } else { + // This can be produced in two cases: + // * elements are nullable and this one is null + // (i.e. max_def_level = def_level + 2) + // * elements are non-nullable (i.e. 
max_def_level = def_level + 1) + RETURN_NOT_OK(def_levels_.Append(static_cast(def_level + 1))); } - return Status::OK(); } + return Status::OK(); } Status HandleListEntries(int16_t def_level, int16_t rep_level, int64_t offset, @@ -261,8 +260,8 @@ class LevelBuilder { } private: - Int16Builder def_levels_; - Int16Builder rep_levels_; + Int16BufferBuilder def_levels_; + Int16BufferBuilder rep_levels_; std::vector null_counts_; std::vector valid_bitmaps_; @@ -307,7 +306,7 @@ struct ColumnWriterContext { Status GetLeafType(const ::arrow::DataType& type, ::arrow::Type::type* leaf_type) { if (type.id() == ::arrow::Type::LIST || type.id() == ::arrow::Type::STRUCT) { if (type.num_children() != 1) { - return Status::Invalid("Nested column branch had multiple children"); + return Status::Invalid("Nested column branch had multiple children: ", type); } return GetLeafType(*type.child(0)->type(), leaf_type); } else {