From bae692233c8738bc7038de8319eb996ef96b805c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 4 Feb 2017 15:26:59 -0500 Subject: [PATCH 01/15] Temporary work on adding offset parameter to Array classes for slicing Change-Id: I8d6845bbd58aca80fc63b3d7171558e36ce130c6 --- cpp/src/arrow/array-list-test.cc | 6 +- cpp/src/arrow/array.cc | 71 ++++++++++++-------- cpp/src/arrow/array.h | 107 ++++++++++++++++++------------- cpp/src/arrow/compare.cc | 16 ++--- 4 files changed, 115 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 8e4d319f5dc..4d92cd4288c 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -138,8 +138,8 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_TRUE(result_->IsNull(1)); ASSERT_EQ(0, result_->raw_offsets()[0]); - ASSERT_EQ(0, result_->offset(1)); - ASSERT_EQ(0, result_->offset(2)); + ASSERT_EQ(0, result_->value_offset(1)); + ASSERT_EQ(0, result_->value_offset(2)); Int32Array* values = static_cast(result_->values().get()); ASSERT_EQ(0, values->length()); @@ -154,7 +154,7 @@ void ValidateBasicListArray(const ListArray* result, const vector& valu ASSERT_EQ(3, result->length()); vector ex_offsets = {0, 3, 3, 7}; for (size_t i = 0; i < ex_offsets.size(); ++i) { - ASSERT_EQ(ex_offsets[i], result->offset(i)); + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); } for (int i = 0; i < result->length(); ++i) { diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 6fc7fb60bf3..d6d81cbfe36 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -17,6 +17,7 @@ #include "arrow/array.h" +#include #include #include #include @@ -44,11 +45,12 @@ Status GetEmptyBitmap( // Base array class Array::Array(const std::shared_ptr& type, int32_t length, int32_t null_count, - const std::shared_ptr& null_bitmap) { - type_ = type; - length_ = length; - null_count_ = null_count; - null_bitmap_ = null_bitmap; + const std::shared_ptr& null_bitmap, int32_t offset) + : type_(type), + length_(length), + offset_(offset), + null_count_(null_count), + null_bitmap_(null_bitmap) { if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); } } @@ -85,7 +87,7 @@ bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_ if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } return are_equal; } - +n Status Array::Validate() const { return Status::OK(); } @@ -99,8 +101,8 @@ Status NullArray::Accept(ArrayVisitor* visitor) const { PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap) - : Array(type, length, null_count, null_bitmap) { + const std::shared_ptr& null_bitmap, int32_t offset) + : Array(type, length, null_count, null_bitmap, offset) { data_ = data; raw_data_ = data == nullptr ? nullptr : data_->data(); } @@ -110,6 +112,16 @@ Status NumericArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +template +std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + + length = std::min(length_ - offset, length); + + return std::make_shared> + return visitor->Visit(*this); +} + template class NumericArray; template class NumericArray; template class NumericArray; @@ -129,14 +141,9 @@ template class NumericArray; // BooleanArray BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap) - : PrimitiveArray( - std::make_shared(), length, data, null_count, null_bitmap) {} - -BooleanArray::BooleanArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap) - : PrimitiveArray(type, length, data, null_count, null_bitmap) {} + int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) + : PrimitiveArray(std::make_shared(), length, data, null_count, + null_bitmap, offset) {} Status BooleanArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); @@ -154,7 +161,7 @@ Status ListArray::Validate() const { << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } - const int32_t last_offset = offset(length_); + const int32_t last_offset = this->value_offset(length_); if (last_offset > 0) { if (!values_) { return Status::Invalid("last offset was non-zero and values was null"); @@ -174,10 +181,10 @@ Status ListArray::Validate() const { } } - int32_t prev_offset = offset(0); + int32_t prev_offset = this->value_offset(0); if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } for (int32_t i = 1; i <= length_; ++i) { - int32_t current_offset = offset(i); + int32_t current_offset = this->value_offset(i); if (IsNull(i - 1) && current_offset != prev_offset) { std::stringstream ss; ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" @@ -208,13 +215,13 @@ static std::shared_ptr kString = std::make_shared(); BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap) - : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} + const std::shared_ptr& null_bitmap, int32_t offset) + : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap, offset) {} BinaryArray::BinaryArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap) - : Array(type, length, null_count, null_bitmap), + int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) + : Array(type, length, null_count, null_bitmap, offset), offsets_buffer_(offsets), offsets_(reinterpret_cast(offsets_buffer_->data())), data_buffer_(data), @@ -233,8 +240,8 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const { StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap) - : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {} + const std::shared_ptr& null_bitmap, int32_t offset) + : BinaryArray(kString, length, offsets, data, null_count, null_bitmap, offset) {} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -248,6 +255,14 @@ Status StringArray::Accept(ArrayVisitor* visitor) const { // ---------------------------------------------------------------------- // Struct +StructArray::StructArray(const std::shared_ptr& type, int32_t length, + const std::vector>& field_arrays, int32_t null_count, + std::shared_ptr null_bitmap, int32_t offset) + : Array(type, length, null_count, null_bitmap, offset) { + type_ = type; + field_arrays_ = field_arrays; +} + std::shared_ptr StructArray::field(int32_t pos) const { DCHECK_GT(field_arrays_.size(), 0); return field_arrays_[pos]; @@ -299,8 +314,8 @@ Status StructArray::Accept(ArrayVisitor* visitor) const { UnionArray::UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, const std::shared_ptr& type_ids, const std::shared_ptr& offsets, - int32_t null_count, const std::shared_ptr& null_bitmap) - : Array(type, length, null_count, null_bitmap), + int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) + : Array(type, length, null_count, null_bitmap, offset), children_(children), type_ids_buffer_(type_ids), offsets_buffer_(offsets) { @@ -347,7 +362,7 @@ Status DictionaryArray::FromBuffer(const std::shared_ptr& type, int32_ DictionaryArray::DictionaryArray( const std::shared_ptr& type, const std::shared_ptr& indices) - : Array(type, indices->length(), indices->null_count(), indices->null_bitmap()), + : Array(type, indices->length(), indices->null_count(), indices->null_bitmap(), 0), dict_type_(static_cast(type.get())), indices_(indices) { DCHECK_EQ(type->type, Type::DICTIONARY); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 3b6e93f9cb3..ee19df89102 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -74,18 +74,22 @@ class ArrayVisitor { class ARROW_EXPORT Array { public: Array(const std::shared_ptr& type, int32_t length, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); virtual ~Array() = default; /// Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i); + return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i + offset_); } /// Size in the number of elements this array contains. int32_t length() const { return length_; } + /// A relative position into another array's data, to enable zero-copy + /// slicing. This value defaults to zero + int32_t offset() const { return offset_; } + /// The number of null entries in the array. int32_t null_count() const { return null_count_; } @@ -120,10 +124,15 @@ class ARROW_EXPORT Array { virtual Status Accept(ArrayVisitor* visitor) const = 0; + /// Construct a zero-copy slice of the array with the indicated offset and + /// length + virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; + protected: std::shared_ptr type_; - int32_t null_count_; int32_t length_; + int32_t offset_; + int32_t null_count_; std::shared_ptr null_bitmap_; const uint8_t* null_bitmap_data_; @@ -152,14 +161,15 @@ Status ARROW_EXPORT GetEmptyBitmap( /// Base class for fixed-size logical types class ARROW_EXPORT PrimitiveArray : public Array { public: + PrimitiveArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& data, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + virtual ~PrimitiveArray() {} std::shared_ptr data() const { return data_; } protected: - PrimitiveArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); std::shared_ptr data_; const uint8_t* raw_data_; }; @@ -169,17 +179,17 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { public: using TypeClass = TYPE; using value_type = typename TypeClass::c_type; + + using PrimitiveArray::PrimitiveArray; + NumericArray(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) - : PrimitiveArray( - std::make_shared(), length, data, null_count, null_bitmap) {} - NumericArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : PrimitiveArray(type, length, data, null_count, null_bitmap) {} + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + int32_t offset = 0) + : PrimitiveArray(std::make_shared(), length, data, null_count, + null_bitmap, offset) {} const value_type* raw_data() const { - return reinterpret_cast(raw_data_); + return reinterpret_cast(raw_data_) + offset_; } Status Accept(ArrayVisitor* visitor) const override; @@ -191,17 +201,17 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: using TypeClass = BooleanType; + using PrimitiveArray::PrimitiveArray; + BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - BooleanArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + int32_t offset = 0); Status Accept(ArrayVisitor* visitor) const override; - const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } - - bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } + bool Value(int i) const { + return BitUtil::GetBit(reinterpret_cast(raw_data_), i + offset_); + } }; // ---------------------------------------------------------------------- @@ -213,8 +223,9 @@ class ARROW_EXPORT ListArray : public Array { ListArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& values, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) - : Array(type, length, null_count, null_bitmap) { + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + int32_t offset = 0) + : Array(type, length, null_count, null_bitmap, offset) { offsets_buffer_ = offsets; offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( offsets_buffer_->data()); @@ -234,11 +245,14 @@ class ARROW_EXPORT ListArray : public Array { const int32_t* raw_offsets() const { return offsets_; } - int32_t offset(int i) const { return offsets_[i]; } - // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i]; } - int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { + return offsets_[i + offset_]; + } + int32_t value_length(int i) const { + i += offset_; + return offsets_[i + 1] - offsets_[i]; + } Status Accept(ArrayVisitor* visitor) const override; @@ -257,18 +271,15 @@ class ARROW_EXPORT BinaryArray : public Array { BinaryArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); - - // Constructor that allows sub-classes/builders to propagate there logical type up the - // class hierarchy. - BinaryArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& offsets, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); + const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); // Return the pointer to the given elements bytes // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy // pointer + offset const uint8_t* GetValue(int i, int32_t* out_length) const { + // Account for base offset + i += offset_; + const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return data_ + pos; @@ -279,17 +290,25 @@ class ARROW_EXPORT BinaryArray : public Array { const int32_t* raw_offsets() const { return offsets_; } - int32_t offset(int i) const { return offsets_[i]; } - // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i]; } - int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { return offsets_[i + offset_]; } + int32_t value_length(int i) const { + i += offset_; + return offsets_[i + 1] - offsets_[i]; + } Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; - private: + protected: + // Constructor that allows sub-classes/builders to propagate there logical type up the + // class hierarchy. + BinaryArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& offsets, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + int32_t offset = 0); + std::shared_ptr offsets_buffer_; const int32_t* offsets_; @@ -303,7 +322,7 @@ class ARROW_EXPORT StringArray : public BinaryArray { StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); // Construct a std::string // TODO: std::bad_alloc possibility @@ -327,11 +346,7 @@ class ARROW_EXPORT StructArray : public Array { StructArray(const std::shared_ptr& type, int32_t length, const std::vector>& field_arrays, int32_t null_count = 0, - std::shared_ptr null_bitmap = nullptr) - : Array(type, length, null_count, null_bitmap) { - type_ = type; - field_arrays_ = field_arrays; - } + std::shared_ptr null_bitmap = nullptr, int32_t offset = 0); Status Validate() const override; @@ -361,7 +376,7 @@ class ARROW_EXPORT UnionArray : public Array { const std::vector>& children, const std::shared_ptr& type_ids, const std::shared_ptr& offsets = nullptr, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); Status Validate() const override; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index d039bba0882..6b1424a40fe 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -76,10 +76,10 @@ class RangeEqualsVisitor : public ArrayVisitor { const bool is_null = left.IsNull(i); if (is_null != right.IsNull(o_i)) { return false; } if (is_null) continue; - const int32_t begin_offset = left.offset(i); - const int32_t end_offset = left.offset(i + 1); - const int32_t right_begin_offset = right.offset(o_i); - const int32_t right_end_offset = right.offset(o_i + 1); + const int32_t begin_offset = left.value_offset(i); + const int32_t end_offset = left.value_offset(i + 1); + const int32_t right_begin_offset = right.value_offset(o_i); + const int32_t right_end_offset = right.value_offset(o_i + 1); // Underlying can't be equal if the size isn't equal if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; @@ -169,10 +169,10 @@ class RangeEqualsVisitor : public ArrayVisitor { const bool is_null = left.IsNull(i); if (is_null != right.IsNull(o_i)) { return false; } if (is_null) continue; - const int32_t begin_offset = left.offset(i); - const int32_t end_offset = left.offset(i + 1); - const int32_t right_begin_offset = right.offset(o_i); - const int32_t right_end_offset = right.offset(o_i + 1); + const int32_t begin_offset = left.value_offset(i); + const int32_t end_offset = left.value_offset(i + 1); + const int32_t right_begin_offset = right.value_offset(o_i); + const int32_t right_end_offset = right.value_offset(o_i + 1); // Underlying can't be equal if the size isn't equal if (end_offset - begin_offset != right_end_offset - right_begin_offset) { return false; From e5029010b2c1e14fced13bd6eb1bd0a110b95c03 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 4 Feb 2017 18:20:53 -0500 Subject: [PATCH 02/15] Move null_count and offset as last two parameters of all array ctors. Implement/test bitmap set bit count with offset Change-Id: Icfb98992e0c060422362ff99e44bbb1bbe40be7b --- cpp/src/arrow/array-primitive-test.cc | 9 +-- cpp/src/arrow/array-string-test.cc | 16 ++-- cpp/src/arrow/array-test.cc | 6 +- cpp/src/arrow/array.cc | 88 ++++++++++++---------- cpp/src/arrow/array.h | 81 ++++++++++++-------- cpp/src/arrow/builder.cc | 12 +-- cpp/src/arrow/ipc/adapter.cc | 14 ++-- cpp/src/arrow/ipc/ipc-json-test.cc | 4 +- cpp/src/arrow/ipc/json-integration-test.cc | 6 +- cpp/src/arrow/ipc/json-internal.cc | 6 +- cpp/src/arrow/ipc/test-common.h | 46 +++++------ cpp/src/arrow/table.cc | 6 +- cpp/src/arrow/test-util.h | 33 +------- cpp/src/arrow/util/bit-util-test.cc | 36 ++++++++- cpp/src/arrow/util/bit-util.cc | 46 +++++++++++ cpp/src/arrow/util/bit-util.h | 16 ++++ cpp/src/arrow/util/logging.h | 4 +- cpp/src/arrow/util/macros.h | 2 +- 18 files changed, 257 insertions(+), 174 deletions(-) diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index c839fb9b192..3dea678c712 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -121,7 +121,7 @@ class TestPrimitiveBuilder : public TestBuilder { } auto expected = - std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); + std::make_shared(size, ex_data, ex_null_bitmap, ex_null_count); std::shared_ptr out; ASSERT_OK(builder->Finish(&out)); @@ -217,7 +217,7 @@ void TestPrimitiveBuilder::Check( } auto expected = - std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); + std::make_shared(size, ex_data, ex_null_bitmap, ex_null_count); std::shared_ptr out; ASSERT_OK(builder->Finish(&out)); @@ -235,15 +235,14 @@ void TestPrimitiveBuilder::Check( for (int i = 0; i < result->length(); ++i) { if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } - bool actual = BitUtil::GetBit(result->raw_data(), i); + bool actual = BitUtil::GetBit(result->data()->data(), i); ASSERT_EQ(static_cast(draws_[i]), actual) << i; } ASSERT_TRUE(result->Equals(*expected)); } typedef ::testing::Types - Primitives; + PInt32, PInt64, PFloat, PDouble> Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index 5ea384acb1c..e737d127196 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -70,7 +70,7 @@ class TestStringArray : public ::testing::Test { null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } protected: @@ -114,7 +114,7 @@ TEST_F(TestStringArray, TestListFunctions) { TEST_F(TestStringArray, TestDestructor) { auto arr = std::make_shared( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } TEST_F(TestStringArray, TestGetString) { @@ -133,9 +133,9 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { length_ = offsets_.size() - 1; auto strings_a = std::make_shared( - length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + length_, offsets_buf_, nullptr, null_bitmap_, null_count_); auto strings_b = std::make_shared( - length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + length_, offsets_buf_, nullptr, null_bitmap_, null_count_); ASSERT_TRUE(strings_a->Equals(strings_b)); } @@ -195,7 +195,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { } else { ASSERT_FALSE(result_->IsNull(i)); result_->GetValue(i, &length); - ASSERT_EQ(pos, result_->offset(i)); + ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast(strings[i % N].size()), length); ASSERT_EQ(strings[i % N], result_->GetString(i)); @@ -232,7 +232,7 @@ class TestBinaryArray : public ::testing::Test { null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } protected: @@ -276,7 +276,7 @@ TEST_F(TestBinaryArray, TestListFunctions) { TEST_F(TestBinaryArray, TestDestructor) { auto arr = std::make_shared( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } TEST_F(TestBinaryArray, TestGetValue) { @@ -307,7 +307,7 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { const BinaryArray& left = static_cast(*left_arr); std::shared_ptr right = std::make_shared( - left.length(), left.offsets(), nullptr, left.null_count(), left.null_bitmap()); + left.length(), left.offsets(), nullptr, left.null_bitmap(), left.null_count()); ASSERT_TRUE(left.Equals(right)); ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index a1d8fdfa91e..5a69913086d 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -43,7 +43,7 @@ TEST_F(TestArray, TestNullCount) { auto data = std::make_shared(pool_); auto null_bitmap = std::make_shared(pool_); - std::unique_ptr arr(new Int32Array(100, data, 10, null_bitmap)); + std::unique_ptr arr(new Int32Array(100, data, null_bitmap, 10)); ASSERT_EQ(10, arr->null_count()); std::unique_ptr arr_no_nulls(new Int32Array(100, data)); @@ -67,7 +67,7 @@ std::shared_ptr MakeArrayFromValidBytes( } std::shared_ptr arr( - new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf)); + new Int32Array(v.size(), value_builder.Finish(), null_buf, null_count)); return arr; } @@ -102,7 +102,7 @@ TEST_F(TestArray, TestIsNull) { std::shared_ptr null_buf = test::bytes_to_null_buffer(null_bitmap); std::unique_ptr arr; - arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf)); + arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d6d81cbfe36..f2be69a79c2 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -44,16 +44,24 @@ Status GetEmptyBitmap( // ---------------------------------------------------------------------- // Base array class -Array::Array(const std::shared_ptr& type, int32_t length, int32_t null_count, - const std::shared_ptr& null_bitmap, int32_t offset) +Array::Array(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) : type_(type), length_(length), offset_(offset), null_count_(null_count), - null_bitmap_(null_bitmap) { + null_bitmap_(null_bitmap), + null_bitmap_data_(nullptr) { if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); } } +int32_t Array::null_count() const { + if (null_count_ < 0 && null_bitmap_) { + null_count_ = CountSetBits(null_bitmap_data_, offset_, length_); + } + return null_count_; +} + bool Array::Equals(const Array& arr) const { bool are_equal = false; Status error = ArrayEquals(*this, arr, &are_equal); @@ -87,7 +95,7 @@ bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_ if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); } return are_equal; } -n + Status Array::Validate() const { return Status::OK(); } @@ -100,9 +108,9 @@ Status NullArray::Accept(ArrayVisitor* visitor) const { // Primitive array base PrimitiveArray::PrimitiveArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, int32_t offset) - : Array(type, length, null_count, null_bitmap, offset) { + const std::shared_ptr& data, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset) { data_ = data; raw_data_ = data == nullptr ? nullptr : data_->data(); } @@ -112,15 +120,14 @@ Status NumericArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } -template -std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); +// template +// std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { +// DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); +// length = std::min(length_ - offset, length); - return std::make_shared> - return visitor->Visit(*this); -} +// return std::make_shared> return visitor->Visit(*this); +// } template class NumericArray; template class NumericArray; @@ -141,9 +148,9 @@ template class NumericArray; // BooleanArray BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) - : PrimitiveArray(std::make_shared(), length, data, null_count, - null_bitmap, offset) {} + const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) + : PrimitiveArray(std::make_shared(), length, data, null_bitmap, + null_count, offset) {} Status BooleanArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); @@ -214,14 +221,14 @@ static std::shared_ptr kBinary = std::make_shared(); static std::shared_ptr kString = std::make_shared(); BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, - const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, int32_t offset) - : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap, offset) {} + const std::shared_ptr& data, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset) + : BinaryArray(kBinary, length, offsets, data, null_bitmap, null_count, offset) {} BinaryArray::BinaryArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) - : Array(type, length, null_count, null_bitmap, offset), + const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset), offsets_buffer_(offsets), offsets_(reinterpret_cast(offsets_buffer_->data())), data_buffer_(data), @@ -239,9 +246,9 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const { } StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, - const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, int32_t offset) - : BinaryArray(kString, length, offsets, data, null_count, null_bitmap, offset) {} + const std::shared_ptr& data, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset) + : BinaryArray(kString, length, offsets, data, null_bitmap, null_count, offset) {} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -256,9 +263,9 @@ Status StringArray::Accept(ArrayVisitor* visitor) const { // Struct StructArray::StructArray(const std::shared_ptr& type, int32_t length, - const std::vector>& field_arrays, int32_t null_count, - std::shared_ptr null_bitmap, int32_t offset) - : Array(type, length, null_count, null_bitmap, offset) { + const std::vector>& field_arrays, + std::shared_ptr null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset) { type_ = type; field_arrays_ = field_arrays; } @@ -314,8 +321,8 @@ Status StructArray::Accept(ArrayVisitor* visitor) const { UnionArray::UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, const std::shared_ptr& type_ids, const std::shared_ptr& offsets, - int32_t null_count, const std::shared_ptr& null_bitmap, int32_t offset) - : Array(type, length, null_count, null_bitmap, offset), + const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset), children_(children), type_ids_buffer_(type_ids), offsets_buffer_(offsets) { @@ -347,14 +354,14 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const { // DictionaryArray Status DictionaryArray::FromBuffer(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& indices, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out) { + const std::shared_ptr& indices, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr* out) { DCHECK_EQ(type->type, Type::DICTIONARY); const auto& dict_type = static_cast(type.get()); std::shared_ptr boxed_indices; - RETURN_NOT_OK(MakePrimitiveArray( - dict_type->index_type(), length, indices, null_count, null_bitmap, &boxed_indices)); + RETURN_NOT_OK(MakePrimitiveArray(dict_type->index_type(), length, indices, null_bitmap, + null_count, offset, &boxed_indices)); *out = std::make_shared(type, boxed_indices); return Status::OK(); @@ -362,7 +369,8 @@ Status DictionaryArray::FromBuffer(const std::shared_ptr& type, int32_ DictionaryArray::DictionaryArray( const std::shared_ptr& type, const std::shared_ptr& indices) - : Array(type, indices->length(), indices->null_count(), indices->null_bitmap(), 0), + : Array(type, indices->length(), indices->null_bitmap(), indices->null_count(), + indices->offset()), dict_type_(static_cast(type.get())), indices_(indices) { DCHECK_EQ(type->type, Type::DICTIONARY); @@ -386,14 +394,14 @@ Status DictionaryArray::Accept(ArrayVisitor* visitor) const { // ---------------------------------------------------------------------- -#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ - case Type::ENUM: \ - out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_bitmap, null_count, offset)); \ break; Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out) { + const std::shared_ptr& data, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index ee19df89102..e8271018401 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -71,16 +71,22 @@ class ArrayVisitor { /// /// The base class is only required to have a null bitmap buffer if the null /// count is greater than 0 +/// +/// If known, the null count can be provided in the base Array constructor. If +/// the null count is not known, pass -1 to indicate that the null count is to +/// be computed on the first call to null_count() class ARROW_EXPORT Array { public: - Array(const std::shared_ptr& type, int32_t length, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + Array(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); virtual ~Array() = default; /// Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i + offset_); + return null_bitmap_data_ != nullptr && + BitUtil::BitNotSet(null_bitmap_data_, i + offset_); } /// Size in the number of elements this array contains. @@ -90,8 +96,11 @@ class ARROW_EXPORT Array { /// slicing. This value defaults to zero int32_t offset() const { return offset_; } - /// The number of null entries in the array. - int32_t null_count() const { return null_count_; } + /// The number of null entries in the array. If the null count was not known + /// at time of construction (and set to a negative value), then the null + /// count will be computed and cached on the first invocation of this + /// function + int32_t null_count() const; std::shared_ptr type() const { return type_; } Type::type type_enum() const { return type_->type; } @@ -126,13 +135,17 @@ class ARROW_EXPORT Array { /// Construct a zero-copy slice of the array with the indicated offset and /// length - virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; + // virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; protected: std::shared_ptr type_; int32_t length_; int32_t offset_; - int32_t null_count_; + + // This member is marked mutable so that it can be modified when null_count() + // is called from a const context and the null count has to be computed (if + // it is not already known) + mutable int32_t null_count_; std::shared_ptr null_bitmap_; const uint8_t* null_bitmap_data_; @@ -148,7 +161,7 @@ class ARROW_EXPORT NullArray : public Array { using TypeClass = NullType; NullArray(const std::shared_ptr& type, int32_t length) - : Array(type, length, length, nullptr) {} + : Array(type, length, nullptr, length) {} explicit NullArray(int32_t length) : NullArray(std::make_shared(), length) {} @@ -162,8 +175,9 @@ Status ARROW_EXPORT GetEmptyBitmap( class ARROW_EXPORT PrimitiveArray : public Array { public: PrimitiveArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); virtual ~PrimitiveArray() {} @@ -183,10 +197,10 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { using PrimitiveArray::PrimitiveArray; NumericArray(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0) - : PrimitiveArray(std::make_shared(), length, data, null_count, - null_bitmap, offset) {} + : PrimitiveArray(std::make_shared(), length, data, null_bitmap, + null_count, offset) {} const value_type* raw_data() const { return reinterpret_cast(raw_data_) + offset_; @@ -204,7 +218,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { using PrimitiveArray::PrimitiveArray; BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); Status Accept(ArrayVisitor* visitor) const override; @@ -223,9 +237,9 @@ class ARROW_EXPORT ListArray : public Array { ListArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& values, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0) - : Array(type, length, null_count, null_bitmap, offset) { + : Array(type, length, null_bitmap, null_count, offset) { offsets_buffer_ = offsets; offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( offsets_buffer_->data()); @@ -246,9 +260,7 @@ class ARROW_EXPORT ListArray : public Array { const int32_t* raw_offsets() const { return offsets_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { - return offsets_[i + offset_]; - } + int32_t value_offset(int i) const { return offsets_[i + offset_]; } int32_t value_length(int i) const { i += offset_; return offsets_[i + 1] - offsets_[i]; @@ -270,8 +282,9 @@ class ARROW_EXPORT BinaryArray : public Array { using TypeClass = BinaryType; BinaryArray(int32_t length, const std::shared_ptr& offsets, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); // Return the pointer to the given elements bytes // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy @@ -306,7 +319,7 @@ class ARROW_EXPORT BinaryArray : public Array { // class hierarchy. BinaryArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); std::shared_ptr offsets_buffer_; @@ -321,8 +334,9 @@ class ARROW_EXPORT StringArray : public BinaryArray { using TypeClass = StringType; StringArray(int32_t length, const std::shared_ptr& offsets, - const std::shared_ptr& data, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); // Construct a std::string // TODO: std::bad_alloc possibility @@ -345,8 +359,9 @@ class ARROW_EXPORT StructArray : public Array { using TypeClass = StructType; StructArray(const std::shared_ptr& type, int32_t length, - const std::vector>& field_arrays, int32_t null_count = 0, - std::shared_ptr null_bitmap = nullptr, int32_t offset = 0); + const std::vector>& field_arrays, + std::shared_ptr null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); Status Validate() const override; @@ -375,8 +390,9 @@ class ARROW_EXPORT UnionArray : public Array { UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, const std::shared_ptr& type_ids, - const std::shared_ptr& offsets = nullptr, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr, int32_t offset = 0); + const std::shared_ptr& offsets = nullptr, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); Status Validate() const override; @@ -434,8 +450,8 @@ class ARROW_EXPORT DictionaryArray : public Array { // Alternate ctor; other attributes (like null count) are inherited from the // passed indices array static Status FromBuffer(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& indices, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out); + const std::shared_ptr& indices, const std::shared_ptr& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr* out); Status Validate() const override; @@ -486,8 +502,9 @@ extern template class ARROW_EXPORT NumericArray; // Create new arrays for logical types that are backed by primitive arrays. Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr& type, - int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out); + int32_t length, const std::shared_ptr& data, + const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset, + std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index b0dc41baf42..efed00a04b6 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -185,7 +185,7 @@ Status PrimitiveBuilder::Finish(std::shared_ptr* out) { RETURN_NOT_OK(data_->Resize(bytes_required)); } *out = std::make_shared::ArrayType>( - type_, length_, data_, null_count_, null_bitmap_); + type_, length_, data_, null_bitmap_, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -244,7 +244,7 @@ Status BooleanBuilder::Finish(std::shared_ptr* out) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - *out = std::make_shared(type_, length_, data_, null_count_, null_bitmap_); + *out = std::make_shared(type_, length_, data_, null_bitmap_, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -313,7 +313,7 @@ Status ListBuilder::Finish(std::shared_ptr* out) { std::shared_ptr offsets = offset_builder_.Finish(); *out = std::make_shared( - type_, length_, offsets, items, null_count_, null_bitmap_); + type_, length_, offsets, items, null_bitmap_, null_count_); Reset(); @@ -352,7 +352,7 @@ Status BinaryBuilder::Finish(std::shared_ptr* out) { auto values = std::dynamic_pointer_cast(list->values()); *out = std::make_shared(list->length(), list->offsets(), values->data(), - list->null_count(), list->null_bitmap()); + list->null_bitmap(), list->null_count()); return Status::OK(); } @@ -364,7 +364,7 @@ Status StringBuilder::Finish(std::shared_ptr* out) { auto values = std::dynamic_pointer_cast(list->values()); *out = std::make_shared(list->length(), list->offsets(), values->data(), - list->null_count(), list->null_bitmap()); + list->null_bitmap(), list->null_count()); return Status::OK(); } @@ -377,7 +377,7 @@ Status StructBuilder::Finish(std::shared_ptr* out) { RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i])); } - *out = std::make_shared(type_, length_, fields, null_count_, null_bitmap_); + *out = std::make_shared(type_, length_, fields, null_bitmap_, null_count_); null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index c8e631c564b..67b05aa5fb2 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -372,7 +372,7 @@ class ArrayLoader : public TypeVisitor { BufferMetadata metadata = context_->metadata->buffer(buffer_index); if (metadata.length == 0) { - *out = std::make_shared(nullptr, 0); + *out = nullptr; return Status::OK(); } else { return file_->ReadAt(metadata.offset, metadata.length, out); @@ -412,8 +412,8 @@ class ArrayLoader : public TypeVisitor { context_->buffer_index++; data.reset(new Buffer(nullptr, 0)); } - return MakePrimitiveArray(field_.type, field_meta.length, data, field_meta.null_count, - null_bitmap, &result_); + return MakePrimitiveArray(field_.type, field_meta.length, data, null_bitmap, + field_meta.null_count, 0, &result_); } template @@ -428,7 +428,7 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values)); result_ = std::make_shared( - field_meta.length, offsets, values, field_meta.null_count, null_bitmap); + field_meta.length, offsets, values, null_bitmap, field_meta.null_count); return Status::OK(); } @@ -496,7 +496,7 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(LoadChild(*type.child(0).get(), &values_array)); result_ = std::make_shared(field_.type, field_meta.length, offsets, - values_array, field_meta.null_count, null_bitmap); + values_array, null_bitmap, field_meta.null_count); return Status::OK(); } @@ -521,7 +521,7 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(LoadChildren(type.children(), &fields)); result_ = std::make_shared( - field_.type, field_meta.length, fields, field_meta.null_count, null_bitmap); + field_.type, field_meta.length, fields, null_bitmap, field_meta.null_count); return Status::OK(); } @@ -542,7 +542,7 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(LoadChildren(type.children(), &fields)); result_ = std::make_shared(field_.type, field_meta.length, fields, - type_ids, offsets, field_meta.null_count, null_bitmap); + type_ids, offsets, null_bitmap, field_meta.null_count); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 30f968c2bfd..ef1a0f36173 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -161,7 +161,7 @@ TEST(TestJsonArrayWriter, NestedTypes) { ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap)); std::shared_ptr offsets_buffer = test::GetBufferFromVector(offsets); - ListArray list_array(list(value_type), 5, offsets_buffer, values_array, 1, list_bitmap); + ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1); TestArrayRoundTrip(list_array); @@ -175,7 +175,7 @@ TEST(TestJsonArrayWriter, NestedTypes) { std::vector> fields = {values_array, values_array, values_array}; StructArray struct_array( - struct_type, static_cast(struct_is_valid.size()), fields, 2, struct_bitmap); + struct_type, static_cast(struct_is_valid.size()), fields, struct_bitmap, 2); TestArrayRoundTrip(struct_array); } diff --git a/cpp/src/arrow/ipc/json-integration-test.cc b/cpp/src/arrow/ipc/json-integration-test.cc index 95bc742054f..17ccc4ac1d0 100644 --- a/cpp/src/arrow/ipc/json-integration-test.cc +++ b/cpp/src/arrow/ipc/json-integration-test.cc @@ -144,10 +144,8 @@ static Status ValidateArrowVsJson( if (!json_schema->Equals(arrow_schema)) { std::stringstream ss; - ss << "JSON schema: \n" - << json_schema->ToString() << "\n" - << "Arrow schema: \n" - << arrow_schema->ToString(); + ss << "JSON schema: \n" << json_schema->ToString() << "\n" + << "Arrow schema: \n" << arrow_schema->ToString(); if (FLAGS_verbose) { std::cout << ss.str() << std::endl; } return Status::Invalid("Schemas did not match"); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 43bd8a4a4e8..3fdd5f83f88 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -901,7 +901,7 @@ class JsonArrayReader { DCHECK_EQ(children.size(), 1); *array = std::make_shared( - type, length, offsets_buffer, children[0], null_count, validity_buffer); + type, length, offsets_buffer, children[0], validity_buffer, null_count); return Status::OK(); } @@ -918,7 +918,7 @@ class JsonArrayReader { RETURN_NOT_OK(GetChildren(json_array, type, &fields)); *array = - std::make_shared(type, length, fields, null_count, validity_buffer); + std::make_shared(type, length, fields, validity_buffer, null_count); return Status::OK(); } @@ -953,7 +953,7 @@ class JsonArrayReader { RETURN_NOT_OK(GetChildren(json_array, type, &children)); *array = std::make_shared(type, length, children, type_id_buffer, - offsets_buffer, null_count, validity_buffer); + offsets_buffer, validity_buffer, null_count); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index ca790ded921..9b155d3c647 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -104,8 +104,8 @@ Status MakeIntRecordBatch(std::shared_ptr* out) { const int length = 1000; // Make the schema - auto f0 = std::make_shared("f0", int32()); - auto f1 = std::make_shared("f1", int32()); + auto f0 = field("f0", int32()); + auto f1 = field("f1", int32()); std::shared_ptr schema(new Schema({f0, f1})); // Example data @@ -141,8 +141,8 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { const int32_t length = 500; auto string_type = utf8(); auto binary_type = binary(); - auto f0 = std::make_shared("f0", string_type); - auto f1 = std::make_shared("f1", binary_type); + auto f0 = field("f0", string_type); + auto f1 = field("f1", binary_type); std::shared_ptr schema(new Schema({f0, f1})); std::shared_ptr a0, a1; @@ -164,9 +164,9 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { Status MakeListRecordBatch(std::shared_ptr* out) { // Make the schema - auto f0 = std::make_shared("f0", kListInt32); - auto f1 = std::make_shared("f1", kListListInt32); - auto f2 = std::make_shared("f2", int32()); + auto f0 = field("f0", kListInt32); + auto f1 = field("f1", kListListInt32); + auto f2 = field("f2", int32()); std::shared_ptr schema(new Schema({f0, f1, f2})); // Example data @@ -187,9 +187,9 @@ Status MakeListRecordBatch(std::shared_ptr* out) { Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { // Make the schema - auto f0 = std::make_shared("f0", kListInt32); - auto f1 = std::make_shared("f1", kListListInt32); - auto f2 = std::make_shared("f2", int32()); + auto f0 = field("f0", kListInt32); + auto f1 = field("f1", kListListInt32); + auto f2 = field("f2", int32()); std::shared_ptr schema(new Schema({f0, f1, f2})); // Example data @@ -208,9 +208,9 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { Status MakeNonNullRecordBatch(std::shared_ptr* out) { // Make the schema - auto f0 = std::make_shared("f0", kListInt32); - auto f1 = std::make_shared("f1", kListListInt32); - auto f2 = std::make_shared("f2", int32()); + auto f0 = field("f0", kListInt32); + auto f1 = field("f1", kListListInt32); + auto f2 = field("f2", int32()); std::shared_ptr schema(new Schema({f0, f1, f2})); // Example data @@ -242,7 +242,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); } - auto f0 = std::make_shared("f0", type); + auto f0 = field("f0", type); std::shared_ptr schema(new Schema({f0})); std::vector> arrays = {array}; out->reset(new RecordBatch(schema, batch_length, arrays)); @@ -260,8 +260,8 @@ Status MakeStruct(std::shared_ptr* out) { // Define schema std::shared_ptr type(new StructType( {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); - auto f0 = std::make_shared("non_null_struct", type); - auto f1 = std::make_shared("null_struct", type); + auto f0 = field("non_null_struct", type); + auto f1 = field("null_struct", type); std::shared_ptr schema(new Schema({f0, f1})); // construct individual nullable/non-nullable struct arrays @@ -271,7 +271,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr null_bitmask; RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, &null_bitmask)); std::shared_ptr with_nulls( - new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask)); + new StructArray(type, list_batch->num_rows(), columns, null_bitmask, 1)); // construct batch std::vector> arrays = {no_nulls, with_nulls}; @@ -282,7 +282,7 @@ Status MakeStruct(std::shared_ptr* out) { Status MakeUnion(std::shared_ptr* out) { // Define schema std::vector> union_types( - {std::make_shared("u0", int32()), std::make_shared("u1", uint8())}); + {field("u0", int32()), field("u1", uint8())}); std::vector type_codes = {5, 10}; auto sparse_type = @@ -291,9 +291,9 @@ Status MakeUnion(std::shared_ptr* out) { auto dense_type = std::make_shared(union_types, type_codes, UnionMode::DENSE); - auto f0 = std::make_shared("sparse_nonnull", sparse_type, false); - auto f1 = std::make_shared("sparse", sparse_type); - auto f2 = std::make_shared("dense", dense_type); + auto f0 = field("sparse_nonnull", sparse_type, false); + auto f1 = field("sparse", sparse_type); + auto f2 = field("dense", dense_type); std::shared_ptr schema(new Schema({f0, f1, f2})); @@ -337,10 +337,10 @@ Status MakeUnion(std::shared_ptr* out) { auto sparse_no_nulls = std::make_shared(sparse_type, length, sparse_children, type_ids_buffer); auto sparse = std::make_shared( - sparse_type, length, sparse_children, type_ids_buffer, nullptr, 1, null_bitmask); + sparse_type, length, sparse_children, type_ids_buffer, nullptr, null_bitmask, 1); auto dense = std::make_shared(dense_type, length, dense_children, - type_ids_buffer, offsets_buffer, 1, null_bitmask); + type_ids_buffer, offsets_buffer, null_bitmask, 1); // construct batch std::vector> arrays = {sparse_no_nulls, sparse, dense}; diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index b3563eaae7b..0908f71d447 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -93,8 +93,7 @@ Status Table::FromRecordBatches(const std::string& name, if (!batches[i]->schema()->Equals(schema)) { std::stringstream ss; ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << batches[i]->schema()->ToString(); + << schema->ToString() << "\nvs\n" << batches[i]->schema()->ToString(); return Status::Invalid(ss.str()); } } @@ -126,8 +125,7 @@ Status ConcatenateTables(const std::string& output_name, if (!tables[i]->schema()->Equals(schema)) { std::stringstream ss; ss << "Schema at index " << static_cast(i) << " was different: \n" - << schema->ToString() << "\nvs\n" - << tables[i]->schema()->ToString(); + << schema->ToString() << "\nvs\n" << tables[i]->schema()->ToString(); return Status::Invalid(ss.str()); } } diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 4e525804b47..2df8e957685 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -61,14 +61,6 @@ EXPECT_TRUE(s.ok()); \ } while (0) -// Alias MSVC popcount to GCC name -#ifdef _MSC_VER -#include -#define __builtin_popcount __popcnt -#include -#define __builtin_popcountll _mm_popcnt_u64 -#endif - namespace arrow { namespace test { @@ -175,29 +167,6 @@ void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { } } -static inline int bitmap_popcount(const uint8_t* data, int length) { - // book keeping - constexpr int pop_len = sizeof(uint64_t); - const uint64_t* i64_data = reinterpret_cast(data); - const int fast_counts = length / pop_len; - const uint64_t* end = i64_data + fast_counts; - - int count = 0; - // popcount as much as possible with the widest possible count - for (auto iter = i64_data; iter < end; ++iter) { - count += __builtin_popcountll(*iter); - } - - // Account for left over bytes (in theory we could fall back to smaller - // versions of popcount but the code complexity is likely not worth it) - const int loop_tail_index = fast_counts * pop_len; - for (int i = loop_tail_index; i < length; ++i) { - if (BitUtil::GetBit(data, i)) { ++count; } - } - - return count; -} - static inline int null_count(const std::vector& valid_bytes) { int result = 0; for (size_t i = 0; i < valid_bytes.size(); ++i) { @@ -254,7 +223,7 @@ class TestBase : public ::testing::Test { auto null_bitmap = std::make_shared(pool_); EXPECT_OK(null_bitmap->Resize(BitUtil::BytesForBits(length))); - return std::make_shared(length, data, null_count, null_bitmap); + return std::make_shared(length, data, null_bitmap, null_count); } protected: diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index cfdee04f6e2..b9a912b35bd 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -17,11 +17,16 @@ #include "arrow/util/bit-util.h" +#include +#include + #include "gtest/gtest.h" +#include "arrow/test-util.h" + namespace arrow { -TEST(UtilTests, TestIsMultipleOf64) { +TEST(BitUtilTests, TestIsMultipleOf64) { using BitUtil::IsMultipleOf64; EXPECT_TRUE(IsMultipleOf64(64)); EXPECT_TRUE(IsMultipleOf64(0)); @@ -31,7 +36,7 @@ TEST(UtilTests, TestIsMultipleOf64) { EXPECT_FALSE(IsMultipleOf64(32)); } -TEST(UtilTests, TestNextPower2) { +TEST(BitUtilTests, TestNextPower2) { using BitUtil::NextPower2; ASSERT_EQ(8, NextPower2(6)); @@ -51,4 +56,31 @@ TEST(UtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } +static inline int64_t SlowCountBits( + const uint8_t* data, int64_t bit_offset, int64_t length) { + int64_t count = 0; + for (int64_t i = bit_offset; i < bit_offset + length; ++i) { + if (BitUtil::GetBit(data, i)) { ++count; } + } + return count; +} + +TEST(BitUtilTests, TestCountSetBits) { + const int kBufferSize = 1000; + uint8_t buffer[kBufferSize] = {0}; + + test::random_bytes(kBufferSize, 0, buffer); + + const int num_bits = kBufferSize * 8; + + std::vector offsets = { + 0, 12, 16, 32, 37, 63, 64, 128, num_bits - 30, num_bits - 64}; + for (int64_t offset : offsets) { + int64_t result = CountSetBits(buffer, offset, num_bits - offset); + int64_t expected = SlowCountBits(buffer, offset, num_bits - offset); + + ASSERT_EQ(expected, result); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 9c82407ecc0..d4cf344f40b 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -15,6 +15,15 @@ // specific language governing permissions and limitations // under the License. +// Alias MSVC popcount to GCC name +#ifdef _MSC_VER +#include +#define __builtin_popcount __popcnt +#include +#define __builtin_popcountll _mm_popcnt_u64 +#endif + +#include #include #include @@ -43,4 +52,41 @@ Status BitUtil::BytesToBits( return Status::OK(); } +int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { + constexpr int64_t pop_len = sizeof(uint64_t) * 8; + + int64_t count = 0; + + // The first bit offset where we can use a 64-bit wide hardware popcount + const int64_t fast_count_start = BitUtil::RoundUp(bit_offset, pop_len); + + // The number of bits until fast_count_start + const int64_t initial_bits = std::min(length, fast_count_start - bit_offset); + for (int64_t i = bit_offset; i < bit_offset + initial_bits; ++i) { + if (BitUtil::GetBit(data, i)) { ++count; } + } + + const int64_t fast_counts = (length - initial_bits) / pop_len; + + // Advance until the first aligned 8-byte word after the initial bits + const uint64_t* u64_data = + reinterpret_cast(data) + fast_count_start / pop_len; + + const uint64_t* end = u64_data + fast_counts; + + // popcount as much as possible with the widest possible count + for (auto iter = u64_data; iter < end; ++iter) { + count += __builtin_popcountll(*iter); + } + + // Account for left over bit (in theory we could fall back to smaller + // versions of popcount but the code complexity is likely not worth it) + const int64_t tail_index = bit_offset + initial_bits + fast_counts * pop_len; + for (int64_t i = tail_index; i < bit_offset + length; ++i) { + if (BitUtil::GetBit(data, i)) { ++count; } + } + + return count; +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 5c8055f9c61..cbb4be0844b 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -82,6 +82,11 @@ static inline bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; } +/// Returns 'value' rounded up to the nearest multiple of 'factor' +inline int64_t RoundUp(int64_t value, int64_t factor) { + return (value + (factor - 1)) / factor * factor; +} + inline int64_t RoundUpToMultipleOf64(int64_t num) { // TODO(wesm): is this definitely needed? // DCHECK_GE(num, 0); @@ -98,6 +103,17 @@ void BytesToBits(const std::vector& bytes, uint8_t* bits); ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr*); } // namespace BitUtil + +/// Compute the number of 1's in the given data array +/// +/// \param[in] data a packed LSB-ordered bitmap as a byte array +/// \param[in] bit_offset a bitwise offset into the bitmap +/// \param[in] length the number of bits to inspect in the bitmap relative to the offset +/// +/// \return The number of set (1) bits in the range +int64_t ARROW_EXPORT CountSetBits( + const uint8_t* data, int64_t bit_offset, int64_t length); + } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index b22f07dd634..06ee8411e28 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,9 +118,9 @@ class CerrLog { class FatalLog : public CerrLog { public: explicit FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL){} // NOLINT + : CerrLog(ARROW_FATAL) {} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index c4a62a475b9..81a9b0cff56 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -25,6 +25,6 @@ TypeName& operator=(const TypeName&) = delete #endif -#define UNUSED(x) (void)x +#define UNUSED(x) (void) x #endif // ARROW_UTIL_MACROS_H From a228b5035d742b0f1a259d9623b1bcab55668ac2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 16:40:35 -0500 Subject: [PATCH 03/15] Implement Slice methods on Array classes Change-Id: Ie40f415d18f3f24f35cd1a49e2138ed9cd734a35 --- cpp/src/arrow/array.cc | 130 +++++++++++++++++++++++------ cpp/src/arrow/array.h | 116 ++++++++++++++----------- cpp/src/arrow/builder.h | 7 -- cpp/src/arrow/ipc/json-internal.cc | 2 +- cpp/src/arrow/type_traits.h | 9 ++ 5 files changed, 180 insertions(+), 84 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index f2be69a79c2..54e4ca282f5 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -31,6 +31,8 @@ namespace arrow { +constexpr int32_t kUnknownNullCount = -1; + Status GetEmptyBitmap( MemoryPool* pool, int32_t length, std::shared_ptr* result) { auto buffer = std::make_shared(pool); @@ -100,6 +102,15 @@ Status Array::Validate() const { return Status::OK(); } +NullArray::NullArray(int32_t length) + : Array(null(), length, nullptr, length) {} + +std::shared_ptr NullArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + return std::make_shared(length); +} + Status NullArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } @@ -120,14 +131,16 @@ Status NumericArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } -// template -// std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { -// DCHECK_LE(offset, length_); - -// length = std::min(length_ - offset, length); +template +std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); -// return std::make_shared> return visitor->Visit(*this); -// } + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared>(type_, length, data_, null_bitmap_, + kUnknownNullCount, new_offset); +} template class NumericArray; template class NumericArray; @@ -156,15 +169,25 @@ Status BooleanArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr BooleanArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(length, data_, null_bitmap_, kUnknownNullCount, + new_offset); +} + // ---------------------------------------------------------------------- // ListArray Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offsets_buffer_) { return Status::Invalid("offsets_buffer_ was null"); } - if (offsets_buffer_->size() / static_cast(sizeof(int32_t)) < length_) { + if (!offsets_) { return Status::Invalid("offsets_ was null"); } + if (offsets_->size() / static_cast(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offsets_buffer_->size() + ss << "offset buffer size (bytes): " << offsets_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } @@ -214,6 +237,16 @@ Status ListArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr ListArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(type_, length, offsets_, values_, null_bitmap_, + kUnknownNullCount, new_offset); +} + // ---------------------------------------------------------------------- // String and binary @@ -229,11 +262,11 @@ BinaryArray::BinaryArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset), - offsets_buffer_(offsets), - offsets_(reinterpret_cast(offsets_buffer_->data())), - data_buffer_(data), - data_(nullptr) { - if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); } + offsets_(offsets), + raw_offsets_(reinterpret_cast(offsets_->data())), + data_(data), + raw_data_(nullptr) { + if (data_ != nullptr) { raw_data_ = data_->data(); } } Status BinaryArray::Validate() const { @@ -245,6 +278,16 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr BinaryArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(length, offsets_, data_, null_bitmap_, + kUnknownNullCount, new_offset); +} + StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) @@ -259,20 +302,30 @@ Status StringArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr StringArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(length, offsets_, data_, null_bitmap_, + kUnknownNullCount, new_offset); +} + // ---------------------------------------------------------------------- // Struct StructArray::StructArray(const std::shared_ptr& type, int32_t length, - const std::vector>& field_arrays, + const std::vector>& children, std::shared_ptr null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset) { type_ = type; - field_arrays_ = field_arrays; + children_ = children; } std::shared_ptr StructArray::field(int32_t pos) const { - DCHECK_GT(field_arrays_.size(), 0); - return field_arrays_[pos]; + DCHECK_GT(children_.size(), 0); + return children_[pos]; } Status StructArray::Validate() const { @@ -282,11 +335,11 @@ Status StructArray::Validate() const { return Status::Invalid("Null count exceeds the length of this struct"); } - if (field_arrays_.size() > 0) { + if (children_.size() > 0) { // Validate fields - int32_t array_length = field_arrays_[0]->length(); + int32_t array_length = children_[0]->length(); size_t idx = 0; - for (auto it : field_arrays_) { + for (auto it : children_) { if (it->length() != array_length) { std::stringstream ss; ss << "Length is not equal from field " << it->type()->ToString() @@ -315,6 +368,16 @@ Status StructArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr StructArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(type_, length, children_, null_bitmap_, + kUnknownNullCount, new_offset); +} + // ---------------------------------------------------------------------- // UnionArray @@ -324,10 +387,10 @@ UnionArray::UnionArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset), children_(children), - type_ids_buffer_(type_ids), - offsets_buffer_(offsets) { - type_ids_ = reinterpret_cast(type_ids->data()); - if (offsets) { offsets_ = reinterpret_cast(offsets->data()); } + type_ids_(type_ids), + offsets_(offsets) { + raw_type_ids_ = reinterpret_cast(type_ids->data()); + if (offsets) { raw_offsets_ = reinterpret_cast(offsets->data()); } } std::shared_ptr UnionArray::child(int32_t pos) const { @@ -350,6 +413,16 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr UnionArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + + // Combine this offset with any existing offset + int64_t new_offset = offset_ + offset; + return std::make_shared(type_, length, children_, type_ids_, offsets_, + null_bitmap_, kUnknownNullCount, new_offset); +} + // ---------------------------------------------------------------------- // DictionaryArray @@ -392,6 +465,11 @@ Status DictionaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr DictionaryArray::Slice(int32_t offset, int32_t length) const { + std::shared_ptr sliced_indices = indices_->Slice(offset, length); + return std::make_shared(type_, sliced_indices); +} + // ---------------------------------------------------------------------- #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index e8271018401..05241bde19e 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -27,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -135,7 +136,13 @@ class ARROW_EXPORT Array { /// Construct a zero-copy slice of the array with the indicated offset and /// length - // virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; + /// + /// \param[in] offset the position of the first element in the constructed slice + /// \param[in] length the length of the slice. If there are not enough elements in the array, + /// the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr + virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; protected: std::shared_ptr type_; @@ -160,12 +167,11 @@ class ARROW_EXPORT NullArray : public Array { public: using TypeClass = NullType; - NullArray(const std::shared_ptr& type, int32_t length) - : Array(type, length, nullptr, length) {} - - explicit NullArray(int32_t length) : NullArray(std::make_shared(), length) {} + explicit NullArray(int32_t length); Status Accept(ArrayVisitor* visitor) const override; + + std::shared_ptr Slice(int32_t offset, int32_t length) const override; }; Status ARROW_EXPORT GetEmptyBitmap( @@ -179,8 +185,6 @@ class ARROW_EXPORT PrimitiveArray : public Array { const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); - virtual ~PrimitiveArray() {} - std::shared_ptr data() const { return data_; } protected: @@ -196,11 +200,13 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { using PrimitiveArray::PrimitiveArray; - NumericArray(int32_t length, const std::shared_ptr& data, - const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, - int32_t offset = 0) - : PrimitiveArray(std::make_shared(), length, data, null_bitmap, - null_count, offset) {} + // Only enable this constructor without a type argument for types without additional metadata + template + NumericArray(typename std::enable_if::is_parameter_free, int32_t>::type length, + const std::shared_ptr& data, const std::shared_ptr& null_bitmap = nullptr, + int32_t null_count = 0, int32_t offset = 0) + : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, + null_count, offset) {} const value_type* raw_data() const { return reinterpret_cast(raw_data_) + offset_; @@ -208,6 +214,8 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + value_type Value(int i) const { return raw_data()[i]; } }; @@ -223,6 +231,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + bool Value(int i) const { return BitUtil::GetBit(reinterpret_cast(raw_data_), i + offset_); } @@ -240,37 +250,37 @@ class ARROW_EXPORT ListArray : public Array { const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0) : Array(type, length, null_bitmap, null_count, offset) { - offsets_buffer_ = offsets; - offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( - offsets_buffer_->data()); + offsets_ = offsets; + raw_offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( + offsets_->data()); values_ = values; } Status Validate() const override; - virtual ~ListArray() = default; - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr values() const { return values_; } - std::shared_ptr offsets() const { return offsets_buffer_; } + std::shared_ptr offsets() const { return offsets_; } std::shared_ptr value_type() const { return values_->type(); } - const int32_t* raw_offsets() const { return offsets_; } + const int32_t* raw_offsets() const { return raw_offsets_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i + offset_]; } + int32_t value_offset(int i) const { return raw_offsets_[i + offset_]; } int32_t value_length(int i) const { i += offset_; - return offsets_[i + 1] - offsets_[i]; + return raw_offsets_[i + 1] - raw_offsets_[i]; } Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + protected: - std::shared_ptr offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr offsets_; + const int32_t* raw_offsets_; std::shared_ptr values_; }; @@ -293,27 +303,29 @@ class ARROW_EXPORT BinaryArray : public Array { // Account for base offset i += offset_; - const int32_t pos = offsets_[i]; - *out_length = offsets_[i + 1] - pos; - return data_ + pos; + const int32_t pos = raw_offsets_[i]; + *out_length = raw_offsets_[i + 1] - pos; + return raw_data_ + pos; } - std::shared_ptr data() const { return data_buffer_; } - std::shared_ptr offsets() const { return offsets_buffer_; } + std::shared_ptr data() const { return data_; } + std::shared_ptr offsets() const { return offsets_; } - const int32_t* raw_offsets() const { return offsets_; } + const int32_t* raw_offsets() const { return raw_offsets_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i + offset_]; } + int32_t value_offset(int i) const { return raw_offsets_[i + offset_]; } int32_t value_length(int i) const { i += offset_; - return offsets_[i + 1] - offsets_[i]; + return raw_offsets_[i + 1] - raw_offsets_[i]; } Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + protected: // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. @@ -322,11 +334,11 @@ class ARROW_EXPORT BinaryArray : public Array { const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); - std::shared_ptr offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr offsets_; + const int32_t* raw_offsets_; - std::shared_ptr data_buffer_; - const uint8_t* data_; + std::shared_ptr data_; + const uint8_t* raw_data_; }; class ARROW_EXPORT StringArray : public BinaryArray { @@ -349,6 +361,8 @@ class ARROW_EXPORT StringArray : public BinaryArray { Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; + + std::shared_ptr Slice(int32_t offset, int32_t length) const override; }; // ---------------------------------------------------------------------- @@ -359,25 +373,25 @@ class ARROW_EXPORT StructArray : public Array { using TypeClass = StructType; StructArray(const std::shared_ptr& type, int32_t length, - const std::vector>& field_arrays, + const std::vector>& children, std::shared_ptr null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); Status Validate() const override; - virtual ~StructArray() {} - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr field(int32_t pos) const; - const std::vector>& fields() const { return field_arrays_; } + const std::vector>& fields() const { return children_; } Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + protected: // The child arrays corresponding to each field of the struct data type. - std::vector> field_arrays_; + std::vector> children_; }; // ---------------------------------------------------------------------- @@ -396,13 +410,11 @@ class ARROW_EXPORT UnionArray : public Array { Status Validate() const override; - virtual ~UnionArray() {} - - std::shared_ptr type_ids() const { return type_ids_buffer_; } - const uint8_t* raw_type_ids() const { return type_ids_; } + std::shared_ptr type_ids() const { return type_ids_; } + const uint8_t* raw_type_ids() const { return raw_type_ids_; } - std::shared_ptr offsets() const { return offsets_buffer_; } - const int32_t* raw_offsets() const { return offsets_; } + std::shared_ptr offsets() const { return offsets_; } + const int32_t* raw_offsets() const { return raw_offsets_; } UnionMode mode() const { return static_cast(*type_.get()).mode; } @@ -412,14 +424,16 @@ class ARROW_EXPORT UnionArray : public Array { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + protected: std::vector> children_; - std::shared_ptr type_ids_buffer_; - const uint8_t* type_ids_; + std::shared_ptr type_ids_; + const uint8_t* raw_type_ids_; - std::shared_ptr offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr offsets_; + const int32_t* raw_offsets_; }; // ---------------------------------------------------------------------- @@ -462,6 +476,8 @@ class ARROW_EXPORT DictionaryArray : public Array { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr Slice(int32_t offset, int32_t length) const override; + protected: const DictionaryType* dict_type_; std::shared_ptr indices_; diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 672d2d8f23e..7a32aae9ba4 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -143,8 +143,6 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), data_(nullptr) {} - virtual ~PrimitiveBuilder() {} - using ArrayBuilder::Advance; /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory @@ -244,8 +242,6 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean()) : ArrayBuilder(pool, type), data_(nullptr) {} - virtual ~BooleanBuilder() {} - using ArrayBuilder::Advance; /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory @@ -321,8 +317,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { ListBuilder( MemoryPool* pool, std::shared_ptr values, const TypePtr& type = nullptr); - virtual ~ListBuilder() {} - Status Init(int32_t elements) override; Status Resize(int32_t capacity) override; Status Finish(std::shared_ptr* out) override; @@ -369,7 +363,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { class ARROW_EXPORT BinaryBuilder : public ListBuilder { public: explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type); - virtual ~BinaryBuilder() {} Status Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(ListBuilder::Append()); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 3fdd5f83f88..27ed98cfff7 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -962,7 +962,7 @@ class JsonArrayReader { typename std::enable_if::value, Status>::type ReadArray( const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { - *array = std::make_shared(type, length); + *array = std::make_shared(length); return Status::OK(); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 5cd5f45466b..c4898b1ac8c 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -125,6 +125,15 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = TimeArray; + // using BuilderType = TimestampBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = HalfFloatArray; From 0355f71086436f72f506357fd47d01d102243e9d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 17:32:14 -0500 Subject: [PATCH 04/15] Implement CopyBitmap function Change-Id: I73120a386114b91063be6d4cf2f30423798eceb8 --- cpp/src/arrow/array.cc | 41 ++++++++++++----------------- cpp/src/arrow/array.h | 25 +++++++++--------- cpp/src/arrow/buffer.cc | 16 +++++++++++ cpp/src/arrow/buffer.h | 19 ++++++++++--- cpp/src/arrow/io/file.cc | 4 +-- cpp/src/arrow/io/hdfs.cc | 8 +++--- cpp/src/arrow/io/io-hdfs-test.cc | 10 ++++--- cpp/src/arrow/io/io-memory-test.cc | 4 +-- cpp/src/arrow/ipc/json-internal.cc | 5 ++-- cpp/src/arrow/util/bit-util-test.cc | 26 ++++++++++++++++++ cpp/src/arrow/util/bit-util.cc | 25 ++++++++++++++++-- cpp/src/arrow/util/bit-util.h | 26 ++++++++++++++++++ 12 files changed, 154 insertions(+), 55 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 54e4ca282f5..1cf86311ad2 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -31,18 +31,12 @@ namespace arrow { +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. See ARROW-33 constexpr int32_t kUnknownNullCount = -1; -Status GetEmptyBitmap( - MemoryPool* pool, int32_t length, std::shared_ptr* result) { - auto buffer = std::make_shared(pool); - RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length))); - memset(buffer->mutable_data(), 0, buffer->size()); - - *result = buffer; - return Status::OK(); -} - // ---------------------------------------------------------------------- // Base array class @@ -102,8 +96,7 @@ Status Array::Validate() const { return Status::OK(); } -NullArray::NullArray(int32_t length) - : Array(null(), length, nullptr, length) {} +NullArray::NullArray(int32_t length) : Array(null(), length, nullptr, length) {} std::shared_ptr NullArray::Slice(int32_t offset, int32_t length) const { DCHECK_LE(offset, length_); @@ -138,8 +131,8 @@ std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) co // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared>(type_, length, data_, null_bitmap_, - kUnknownNullCount, new_offset); + return std::make_shared>( + type_, length, data_, null_bitmap_, kUnknownNullCount, new_offset); } template class NumericArray; @@ -175,8 +168,8 @@ std::shared_ptr BooleanArray::Slice(int32_t offset, int32_t length) const // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared(length, data_, null_bitmap_, kUnknownNullCount, - new_offset); + return std::make_shared( + length, data_, null_bitmap_, kUnknownNullCount, new_offset); } // ---------------------------------------------------------------------- @@ -243,8 +236,8 @@ std::shared_ptr ListArray::Slice(int32_t offset, int32_t length) const { // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared(type_, length, offsets_, values_, null_bitmap_, - kUnknownNullCount, new_offset); + return std::make_shared( + type_, length, offsets_, values_, null_bitmap_, kUnknownNullCount, new_offset); } // ---------------------------------------------------------------------- @@ -284,8 +277,8 @@ std::shared_ptr BinaryArray::Slice(int32_t offset, int32_t length) const // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared(length, offsets_, data_, null_bitmap_, - kUnknownNullCount, new_offset); + return std::make_shared( + length, offsets_, data_, null_bitmap_, kUnknownNullCount, new_offset); } StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, @@ -308,8 +301,8 @@ std::shared_ptr StringArray::Slice(int32_t offset, int32_t length) const // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared(length, offsets_, data_, null_bitmap_, - kUnknownNullCount, new_offset); + return std::make_shared( + length, offsets_, data_, null_bitmap_, kUnknownNullCount, new_offset); } // ---------------------------------------------------------------------- @@ -374,8 +367,8 @@ std::shared_ptr StructArray::Slice(int32_t offset, int32_t length) const // Combine this offset with any existing offset int64_t new_offset = offset_ + offset; - return std::make_shared(type_, length, children_, null_bitmap_, - kUnknownNullCount, new_offset); + return std::make_shared( + type_, length, children_, null_bitmap_, kUnknownNullCount, new_offset); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 05241bde19e..509ae08da74 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -138,7 +138,8 @@ class ARROW_EXPORT Array { /// length /// /// \param[in] offset the position of the first element in the constructed slice - /// \param[in] length the length of the slice. If there are not enough elements in the array, + /// \param[in] length the length of the slice. If there are not enough elements in the + /// array, /// the length will be adjusted accordingly /// /// \return a new object wrapped in std::shared_ptr @@ -174,9 +175,6 @@ class ARROW_EXPORT NullArray : public Array { std::shared_ptr Slice(int32_t offset, int32_t length) const override; }; -Status ARROW_EXPORT GetEmptyBitmap( - MemoryPool* pool, int32_t length, std::shared_ptr* result); - /// Base class for fixed-size logical types class ARROW_EXPORT PrimitiveArray : public Array { public: @@ -200,13 +198,16 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { using PrimitiveArray::PrimitiveArray; - // Only enable this constructor without a type argument for types without additional metadata + // Only enable this constructor without a type argument for types without additional + // metadata template - NumericArray(typename std::enable_if::is_parameter_free, int32_t>::type length, - const std::shared_ptr& data, const std::shared_ptr& null_bitmap = nullptr, - int32_t null_count = 0, int32_t offset = 0) - : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, - null_count, offset) {} + NumericArray( + typename std::enable_if::is_parameter_free, int32_t>::type length, + const std::shared_ptr& data, + const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0) + : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, + null_count, offset) {} const value_type* raw_data() const { return reinterpret_cast(raw_data_) + offset_; @@ -251,8 +252,8 @@ class ARROW_EXPORT ListArray : public Array { int32_t offset = 0) : Array(type, length, null_bitmap, null_count, offset) { offsets_ = offsets; - raw_offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( - offsets_->data()); + raw_offsets_ = + offsets == nullptr ? nullptr : reinterpret_cast(offsets_->data()); values_ = values; } diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 6cce0efa377..fb5a010efa2 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -116,4 +116,20 @@ Status PoolBuffer::Resize(int64_t new_size, bool shrink_to_fit) { return Status::OK(); } +Status AllocateBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr* out) { + auto buffer = std::make_shared(pool); + RETURN_NOT_OK(buffer->Resize(size)); + *out = buffer; + return Status::OK(); +} + +Status AllocateResizableBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr* out) { + auto buffer = std::make_shared(pool); + RETURN_NOT_OK(buffer->Resize(size)); + *out = buffer; + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index d43ab0375b7..2271a399585 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_UTIL_BUFFER_H -#define ARROW_UTIL_BUFFER_H +#ifndef ARROW_BUFFER_H +#define ARROW_BUFFER_H #include #include @@ -232,6 +232,19 @@ class ARROW_EXPORT BufferBuilder { int64_t size_; }; +/// Allocate a new mutable buffer from a memory pool +/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer with padding +/// +/// \return Status message +Status ARROW_EXPORT AllocateBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr* out); + +Status ARROW_EXPORT AllocateResizableBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr* out); + } // namespace arrow -#endif // ARROW_UTIL_BUFFER_H +#endif // ARROW_BUFFER_H diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index ff58e539b93..c1f0ea48a9d 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -401,8 +401,8 @@ class ReadableFile::ReadableFileImpl : public OSFile { Status Open(const std::string& path) { return OpenReadable(path); } Status ReadBuffer(int64_t nbytes, std::shared_ptr* out) { - auto buffer = std::make_shared(pool_); - RETURN_NOT_OK(buffer->Resize(nbytes)); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &buffer)); int64_t bytes_read = 0; RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 2845b0d8c44..5682f44b94a 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -125,8 +125,8 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { } Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) { - auto buffer = std::make_shared(pool_); - RETURN_NOT_OK(buffer->Resize(nbytes)); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &buffer)); int64_t bytes_read = 0; RETURN_NOT_OK(ReadAt(position, nbytes, &bytes_read, buffer->mutable_data())); @@ -152,8 +152,8 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { } Status Read(int64_t nbytes, std::shared_ptr* out) { - auto buffer = std::make_shared(pool_); - RETURN_NOT_OK(buffer->Resize(nbytes)); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &buffer)); int64_t bytes_read = 0; RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc index 72e0ba8f298..f0e5a280d31 100644 --- a/cpp/src/arrow/io/io-hdfs-test.cc +++ b/cpp/src/arrow/io/io-hdfs-test.cc @@ -336,8 +336,9 @@ TYPED_TEST(TestHdfsClient, LargeFile) { std::shared_ptr file; ASSERT_OK(this->client_->OpenReadable(path, &file)); - auto buffer = std::make_shared(); - ASSERT_OK(buffer->Resize(size)); + std::shared_ptr buffer; + ASSERT_OK(AllocateBuffer(nullptr, size, &buffer)); + int64_t bytes_read = 0; ASSERT_OK(file->Read(size, &bytes_read, buffer->mutable_data())); @@ -348,8 +349,9 @@ TYPED_TEST(TestHdfsClient, LargeFile) { std::shared_ptr file2; ASSERT_OK(this->client_->OpenReadable(path, 1 << 18, &file2)); - auto buffer2 = std::make_shared(); - ASSERT_OK(buffer2->Resize(size)); + std::shared_ptr buffer2; + ASSERT_OK(AllocateBuffer(nullptr, size, &buffer2)); + ASSERT_OK(file2->Read(size, &bytes_read, buffer2->mutable_data())); ASSERT_EQ(0, std::memcmp(buffer2->data(), data.data(), size)); ASSERT_EQ(size, bytes_read); diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/io-memory-test.cc index c0b01653cb1..442cd0c4bbc 100644 --- a/cpp/src/arrow/io/io-memory-test.cc +++ b/cpp/src/arrow/io/io-memory-test.cc @@ -73,8 +73,8 @@ TEST(TestBufferReader, RetainParentReference) { std::shared_ptr slice1; std::shared_ptr slice2; { - auto buffer = std::make_shared(); - ASSERT_OK(buffer->Resize(static_cast(data.size()))); + std::shared_ptr buffer; + ASSERT_OK(AllocateBuffer(nullptr, static_cast(data.size()), &buffer)); std::memcpy(buffer->mutable_data(), data.c_str(), data.size()); BufferReader reader(buffer); ASSERT_OK(reader.Read(4, &slice1)); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 27ed98cfff7..cafa6f5b5ab 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -869,8 +869,9 @@ class JsonArrayReader { template Status GetIntArray( const RjArray& json_array, const int32_t length, std::shared_ptr* out) { - auto buffer = std::make_shared(pool_); - RETURN_NOT_OK(buffer->Resize(length * sizeof(T))); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, length * sizeof(T), &buffer)); + T* values = reinterpret_cast(buffer->mutable_data()); for (int i = 0; i < length; ++i) { const rj::Value& val = json_array[i]; diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index b9a912b35bd..cb2fd1ab276 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" +#include "arrow/buffer.h" #include "arrow/test-util.h" namespace arrow { @@ -83,4 +84,29 @@ TEST(BitUtilTests, TestCountSetBits) { } } +TEST(BitUtilTests, TestCopyBitmap) { + const int kBufferSize = 1000; + + std::shared_ptr buffer; + ASSERT_OK(AllocateBuffer(default_memory_pool(), kBufferSize, &buffer)); + memset(buffer->mutable_data(), 0, kBufferSize); + test::random_bytes(kBufferSize, 0, buffer->mutable_data()); + + const int num_bits = kBufferSize * 8; + + const uint8_t* src = buffer->data(); + + std::vector offsets = {0, 12, 16, 32, 37, 63, 64, 128}; + for (int64_t offset : offsets) { + const int64_t copy_length = num_bits - offset; + + std::shared_ptr copy; + ASSERT_OK(CopyBitmap(default_memory_pool(), src, offset, copy_length, ©)); + + for (int64_t i = 0; i < copy_length; ++i) { + ASSERT_EQ(BitUtil::GetBit(src, i + offset), BitUtil::GetBit(copy->data(), i)); + } + } +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index d4cf344f40b..5c6792f9cb0 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -28,6 +28,7 @@ #include #include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/bit-util.h" @@ -43,8 +44,9 @@ Status BitUtil::BytesToBits( const std::vector& bytes, std::shared_ptr* out) { int bit_length = BitUtil::BytesForBits(bytes.size()); - auto buffer = std::make_shared(); - RETURN_NOT_OK(buffer->Resize(bit_length)); + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), bit_length, &buffer)); + memset(buffer->mutable_data(), 0, bit_length); BytesToBits(bytes, buffer->mutable_data()); @@ -89,4 +91,23 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { return count; } +Status GetEmptyBitmap( + MemoryPool* pool, int32_t length, std::shared_ptr* result) { + RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), result)); + memset((*result)->mutable_data(), 0, (*result)->size()); + return Status::OK(); +} + +Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int32_t offset, int32_t length, + std::shared_ptr* out) { + std::shared_ptr buffer; + RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), &buffer)); + uint8_t* dest = buffer->mutable_data(); + for (int64_t i = 0; i < length; ++i) { + BitUtil::SetBitTo(dest, i, BitUtil::GetBit(data, i + offset)); + } + *out = buffer; + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index cbb4be0844b..fe71ed32772 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -28,6 +28,8 @@ namespace arrow { class Buffer; +class MemoryPool; +class MutableBuffer; class Status; namespace BitUtil { @@ -62,6 +64,12 @@ static inline void SetBit(uint8_t* bits, int i) { bits[i / 8] |= kBitmask[i % 8]; } +static inline void SetBitTo(uint8_t* bits, int i, bool bit_is_set) { + // See https://graphics.stanford.edu/~seander/bithacks.html + // "Conditionally set or clear bits without branching" + bits[i / 8] ^= (-bit_is_set ^ bits[i / 8]) & kBitmask[i % 8]; +} + static inline int64_t NextPower2(int64_t n) { n--; n |= n >> 1; @@ -104,6 +112,24 @@ ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr* result); + +/// Copy a bit range of an existing bitmap +/// +/// \param[in] pool memory pool to allocate memory from +/// \param[in] bitmap source data +/// \param[in] offset bit offset into the source data +/// \param[in] length number of bits to copy +/// \param[out] out the resulting copy +/// +/// \return Status message +Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int32_t offset, + int32_t length, std::shared_ptr* out); + /// Compute the number of 1's in the given data array /// /// \param[in] data a packed LSB-ordered bitmap as a byte array From a72653d6c4250fa7ff3373c29b1ffa29015f2791 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 18:17:50 -0500 Subject: [PATCH 05/15] Rename offsets to value_offsets in list/binary/string/union for better clarity. Test Slice for primitive arrays Change-Id: I11fce17b5581838ae907a69ea3d09094bddf9c96 --- cpp/src/arrow/array-list-test.cc | 2 +- cpp/src/arrow/array-primitive-test.cc | 29 +++++++ cpp/src/arrow/array-string-test.cc | 4 +- cpp/src/arrow/array-struct-test.cc | 2 +- cpp/src/arrow/array.cc | 107 ++++++++++++-------------- cpp/src/arrow/array.h | 71 ++++++++++------- cpp/src/arrow/builder.cc | 8 +- cpp/src/arrow/compare.cc | 14 ++-- cpp/src/arrow/ipc/adapter.cc | 6 +- cpp/src/arrow/ipc/json-internal.cc | 6 +- cpp/src/arrow/pretty_print.cc | 12 +-- 11 files changed, 149 insertions(+), 112 deletions(-) diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 4d92cd4288c..4ff7cac77db 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -137,7 +137,7 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_EQ(0, result_->raw_offsets()[0]); + ASSERT_EQ(0, result_->raw_value_offsets()[0]); ASSERT_EQ(0, result_->value_offset(1)); ASSERT_EQ(0, result_->value_offset(2)); diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index 3dea678c712..9245655fa5e 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -346,6 +346,35 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) { array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array)); } +TYPED_TEST(TestPrimitiveBuilder, SliceEquality) { + DECL_T(); + + const int size = 1000; + this->RandomData(size); + vector& draws = this->draws_; + vector& valid_bytes = this->valid_bytes_; + auto builder = this->builder_.get(); + + std::shared_ptr array; + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array)); + + std::shared_ptr slice, slice2; + + slice = array->Slice(5); + slice2 = array->Slice(5); + ASSERT_EQ(size - 5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); + + slice = array->Slice(5, 10); + slice2 = array->Slice(5, 10); + ASSERT_EQ(10, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, 10, 0, slice)); +} + TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index e737d127196..b57ce057304 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -306,8 +306,8 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { ASSERT_OK(builder.Finish(&left_arr)); const BinaryArray& left = static_cast(*left_arr); - std::shared_ptr right = std::make_shared( - left.length(), left.offsets(), nullptr, left.null_bitmap(), left.null_count()); + std::shared_ptr right = std::make_shared(left.length(), + left.value_offsets(), nullptr, left.null_bitmap(), left.null_count()); ASSERT_TRUE(left.Equals(right)); ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index 5827c399dda..6790e9b9c29 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -75,7 +75,7 @@ void ValidateBasicStructArray(const StructArray* result, ASSERT_EQ(4, list_char_arr->length()); ASSERT_EQ(10, list_char_arr->values()->length()); for (size_t i = 0; i < list_offsets.size(); ++i) { - ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]); + ASSERT_EQ(list_offsets[i], list_char_arr->raw_value_offsets()[i]); } for (size_t i = 0; i < list_values.size(); ++i) { ASSERT_EQ(list_values[i], char_arr->Value(i)); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 1cf86311ad2..5bf39315702 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -92,6 +92,20 @@ bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_ return are_equal; } +// Last two parameters are in-out parameters +static inline void ConformSliceParams( + int32_t array_offset, int32_t array_length, int32_t* offset, int32_t* length) { + DCHECK_LE(*offset, array_length); + DCHECK_GE(offset, 0); + *length = std::min(array_length - *offset, *length); + *offset = array_offset + *offset; +} + +std::shared_ptr Array::Slice(int32_t offset) const { + int32_t slice_length = length_ - offset; + return Slice(offset, slice_length); +} + Status Array::Validate() const { return Status::OK(); } @@ -126,13 +140,9 @@ Status NumericArray::Accept(ArrayVisitor* visitor) const { template std::shared_ptr NumericArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared>( - type_, length, data_, null_bitmap_, kUnknownNullCount, new_offset); + type_, length, data_, null_bitmap_, kUnknownNullCount, offset); } template class NumericArray; @@ -163,13 +173,9 @@ Status BooleanArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr BooleanArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( - length, data_, null_bitmap_, kUnknownNullCount, new_offset); + length, data_, null_bitmap_, kUnknownNullCount, offset); } // ---------------------------------------------------------------------- @@ -177,10 +183,10 @@ std::shared_ptr BooleanArray::Slice(int32_t offset, int32_t length) const Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offsets_) { return Status::Invalid("offsets_ was null"); } - if (offsets_->size() / static_cast(sizeof(int32_t)) < length_) { + if (!value_offsets_) { return Status::Invalid("value_offsets_ was null"); } + if (value_offsets_->size() / static_cast(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offsets_->size() + ss << "offset buffer size (bytes): " << value_offsets_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } @@ -210,8 +216,9 @@ Status ListArray::Validate() const { int32_t current_offset = this->value_offset(i); if (IsNull(i - 1) && current_offset != prev_offset) { std::stringstream ss; - ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" - << current_offset << "!=" << prev_offset; + ss << "Offset invariant failure at: " << i + << " inconsistent value_offsets for null slot" << current_offset + << "!=" << prev_offset; return Status::Invalid(ss.str()); } if (current_offset < prev_offset) { @@ -231,13 +238,9 @@ Status ListArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr ListArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( - type_, length, offsets_, values_, null_bitmap_, kUnknownNullCount, new_offset); + type_, length, value_offsets_, values_, null_bitmap_, kUnknownNullCount, offset); } // ---------------------------------------------------------------------- @@ -246,17 +249,18 @@ std::shared_ptr ListArray::Slice(int32_t offset, int32_t length) const { static std::shared_ptr kBinary = std::make_shared(); static std::shared_ptr kString = std::make_shared(); -BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, +BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) - : BinaryArray(kBinary, length, offsets, data, null_bitmap, null_count, offset) {} + : BinaryArray(kBinary, length, value_offsets, data, null_bitmap, null_count, offset) { +} BinaryArray::BinaryArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& offsets, const std::shared_ptr& data, + const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset), - offsets_(offsets), - raw_offsets_(reinterpret_cast(offsets_->data())), + value_offsets_(value_offsets), + raw_value_offsets_(reinterpret_cast(value_offsets_->data())), data_(data), raw_data_(nullptr) { if (data_ != nullptr) { raw_data_ = data_->data(); } @@ -272,19 +276,16 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr BinaryArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( - length, offsets_, data_, null_bitmap_, kUnknownNullCount, new_offset); + length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset); } -StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, +StringArray::StringArray(int32_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) - : BinaryArray(kString, length, offsets, data, null_bitmap, null_count, offset) {} + : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) { +} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -296,13 +297,9 @@ Status StringArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr StringArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( - length, offsets_, data_, null_bitmap_, kUnknownNullCount, new_offset); + length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset); } // ---------------------------------------------------------------------- @@ -362,13 +359,9 @@ Status StructArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr StructArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; + ConformSliceParams(offset_, length_, &offset, &length); return std::make_shared( - type_, length, children_, null_bitmap_, kUnknownNullCount, new_offset); + type_, length, children_, null_bitmap_, kUnknownNullCount, offset); } // ---------------------------------------------------------------------- @@ -376,14 +369,16 @@ std::shared_ptr StructArray::Slice(int32_t offset, int32_t length) const UnionArray::UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, - const std::shared_ptr& type_ids, const std::shared_ptr& offsets, + const std::shared_ptr& type_ids, const std::shared_ptr& value_offsets, const std::shared_ptr& null_bitmap, int32_t null_count, int32_t offset) : Array(type, length, null_bitmap, null_count, offset), children_(children), type_ids_(type_ids), - offsets_(offsets) { + value_offsets_(value_offsets) { raw_type_ids_ = reinterpret_cast(type_ids->data()); - if (offsets) { raw_offsets_ = reinterpret_cast(offsets->data()); } + if (value_offsets) { + raw_value_offsets_ = reinterpret_cast(value_offsets->data()); + } } std::shared_ptr UnionArray::child(int32_t pos) const { @@ -407,13 +402,9 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const { } std::shared_ptr UnionArray::Slice(int32_t offset, int32_t length) const { - DCHECK_LE(offset, length_); - length = std::min(length_ - offset, length); - - // Combine this offset with any existing offset - int64_t new_offset = offset_ + offset; - return std::make_shared(type_, length, children_, type_ids_, offsets_, - null_bitmap_, kUnknownNullCount, new_offset); + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared(type_, length, children_, type_ids_, value_offsets_, + null_bitmap_, kUnknownNullCount, offset); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 509ae08da74..e089a5f94df 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -109,11 +109,13 @@ class ARROW_EXPORT Array { /// Buffer for the null bitmap. /// /// Note that for `null_count == 0`, this can be a `nullptr`. + /// This buffer does not account for any slice offset std::shared_ptr null_bitmap() const { return null_bitmap_; } /// Raw pointer to the null bitmap. /// /// Note that for `null_count == 0`, this can be a `nullptr`. + /// This buffer does not account for any slice offset const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } bool Equals(const Array& arr) const; @@ -145,6 +147,9 @@ class ARROW_EXPORT Array { /// \return a new object wrapped in std::shared_ptr virtual std::shared_ptr Slice(int32_t offset, int32_t length) const = 0; + /// Slice from offset until end of the array + std::shared_ptr Slice(int32_t offset) const; + protected: std::shared_ptr type_; int32_t length_; @@ -183,6 +188,8 @@ class ARROW_EXPORT PrimitiveArray : public Array { const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); + /// The memory containing this array's data + /// This buffer does not account for any slice offset std::shared_ptr data() const { return data_; } protected: @@ -247,13 +254,14 @@ class ARROW_EXPORT ListArray : public Array { using TypeClass = ListType; ListArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& offsets, const std::shared_ptr& values, + const std::shared_ptr& value_offsets, const std::shared_ptr& values, const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0) : Array(type, length, null_bitmap, null_count, offset) { - offsets_ = offsets; - raw_offsets_ = - offsets == nullptr ? nullptr : reinterpret_cast(offsets_->data()); + value_offsets_ = value_offsets; + raw_value_offsets_ = value_offsets == nullptr + ? nullptr + : reinterpret_cast(value_offsets_->data()); values_ = values; } @@ -262,17 +270,20 @@ class ARROW_EXPORT ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr values() const { return values_; } - std::shared_ptr offsets() const { return offsets_; } + + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return value_offsets_; } std::shared_ptr value_type() const { return values_->type(); } - const int32_t* raw_offsets() const { return raw_offsets_; } + /// Return pointer to raw value offsets accounting for any slice offset + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return raw_offsets_[i + offset_]; } + int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; } int32_t value_length(int i) const { i += offset_; - return raw_offsets_[i + 1] - raw_offsets_[i]; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } Status Accept(ArrayVisitor* visitor) const override; @@ -280,8 +291,8 @@ class ARROW_EXPORT ListArray : public Array { std::shared_ptr Slice(int32_t offset, int32_t length) const override; protected: - std::shared_ptr offsets_; - const int32_t* raw_offsets_; + std::shared_ptr value_offsets_; + const int32_t* raw_value_offsets_; std::shared_ptr values_; }; @@ -292,7 +303,7 @@ class ARROW_EXPORT BinaryArray : public Array { public: using TypeClass = BinaryType; - BinaryArray(int32_t length, const std::shared_ptr& offsets, + BinaryArray(int32_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); @@ -304,21 +315,24 @@ class ARROW_EXPORT BinaryArray : public Array { // Account for base offset i += offset_; - const int32_t pos = raw_offsets_[i]; - *out_length = raw_offsets_[i + 1] - pos; + const int32_t pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; } + /// Note that this buffer does not account for any slice offset std::shared_ptr data() const { return data_; } - std::shared_ptr offsets() const { return offsets_; } - const int32_t* raw_offsets() const { return raw_offsets_; } + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return value_offsets_; } + + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return raw_offsets_[i + offset_]; } + int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; } int32_t value_length(int i) const { i += offset_; - return raw_offsets_[i + 1] - raw_offsets_[i]; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } Status Validate() const override; @@ -331,12 +345,12 @@ class ARROW_EXPORT BinaryArray : public Array { // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. BinaryArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& offsets, const std::shared_ptr& data, + const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); - std::shared_ptr offsets_; - const int32_t* raw_offsets_; + std::shared_ptr value_offsets_; + const int32_t* raw_value_offsets_; std::shared_ptr data_; const uint8_t* raw_data_; @@ -346,7 +360,7 @@ class ARROW_EXPORT StringArray : public BinaryArray { public: using TypeClass = StringType; - StringArray(int32_t length, const std::shared_ptr& offsets, + StringArray(int32_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); @@ -405,17 +419,20 @@ class ARROW_EXPORT UnionArray : public Array { UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, const std::shared_ptr& type_ids, - const std::shared_ptr& offsets = nullptr, + const std::shared_ptr& value_offsets = nullptr, const std::shared_ptr& null_bitmap = nullptr, int32_t null_count = 0, int32_t offset = 0); Status Validate() const override; + /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return type_ids_; } - const uint8_t* raw_type_ids() const { return raw_type_ids_; } - std::shared_ptr offsets() const { return offsets_; } - const int32_t* raw_offsets() const { return raw_offsets_; } + /// Note that this buffer does not account for any slice offset + std::shared_ptr value_offsets() const { return value_offsets_; } + + const uint8_t* raw_type_ids() const { return raw_type_ids_; } + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } UnionMode mode() const { return static_cast(*type_.get()).mode; } @@ -433,8 +450,8 @@ class ARROW_EXPORT UnionArray : public Array { std::shared_ptr type_ids_; const uint8_t* raw_type_ids_; - std::shared_ptr offsets_; - const int32_t* raw_offsets_; + std::shared_ptr value_offsets_; + const int32_t* raw_value_offsets_; }; // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index efed00a04b6..5c6b19ff228 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -351,8 +351,8 @@ Status BinaryBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offsets(), values->data(), - list->null_bitmap(), list->null_count()); + *out = std::make_shared(list->length(), list->value_offsets(), + values->data(), list->null_bitmap(), list->null_count()); return Status::OK(); } @@ -363,8 +363,8 @@ Status StringBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offsets(), values->data(), - list->null_bitmap(), list->null_count()); + *out = std::make_shared(list->length(), list->value_offsets(), + values->data(), list->null_bitmap(), list->null_count()); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 6b1424a40fe..679650920d5 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -255,8 +255,8 @@ class RangeEqualsVisitor : public ArrayVisitor { return false; } } else { - const int32_t offset = left.raw_offsets()[i]; - const int32_t o_offset = right.raw_offsets()[i]; + const int32_t offset = left.raw_value_offsets()[i]; + const int32_t o_offset = right.raw_value_offsets()[i]; if (!left.child(child_num)->RangeEquals( offset, offset + 1, o_offset, right.child(child_num))) { return false; @@ -378,11 +378,11 @@ class EqualsVisitor : public RangeEqualsVisitor { bool CompareBinary(const BinaryArray& left) { const auto& right = static_cast(right_); - bool equal_offsets = - left.offsets()->Equals(*right.offsets(), (left.length() + 1) * sizeof(int32_t)); + bool equal_offsets = left.value_offsets()->Equals( + *right.value_offsets(), (left.length() + 1) * sizeof(int32_t)); if (!equal_offsets) { return false; } if (!left.data() && !(right.data())) { return true; } - return left.data()->Equals(*right.data(), left.raw_offsets()[left.length()]); + return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]); } Status Visit(const StringArray& left) override { @@ -397,8 +397,8 @@ class EqualsVisitor : public RangeEqualsVisitor { Status Visit(const ListArray& left) override { const auto& right = static_cast(right_); - if (!left.offsets()->Equals( - *right.offsets(), (left.length() + 1) * sizeof(int32_t))) { + if (!left.value_offsets()->Equals( + *right.value_offsets(), (left.length() + 1) * sizeof(int32_t))) { result_ = false; } else { result_ = left.values()->Equals(right.values()); diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 67b05aa5fb2..4ca716add77 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -214,7 +214,7 @@ class RecordBatchWriter : public ArrayVisitor { } Status VisitBinary(const BinaryArray& array) { - buffers_.push_back(array.offsets()); + buffers_.push_back(array.value_offsets()); buffers_.push_back(array.data()); return Status::OK(); } @@ -262,7 +262,7 @@ class RecordBatchWriter : public ArrayVisitor { } Status Visit(const ListArray& array) override { - buffers_.push_back(array.offsets()); + buffers_.push_back(array.value_offsets()); --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*array.values().get())); ++max_recursion_depth_; @@ -281,7 +281,7 @@ class RecordBatchWriter : public ArrayVisitor { Status Visit(const UnionArray& array) override { buffers_.push_back(array.type_ids()); - if (array.mode() == UnionMode::DENSE) { buffers_.push_back(array.offsets()); } + if (array.mode() == UnionMode::DENSE) { buffers_.push_back(array.value_offsets()); } --max_recursion_depth_; for (const auto& field : array.children()) { diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index cafa6f5b5ab..fd184958d51 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -464,7 +464,7 @@ class JsonArrayWriter : public ArrayVisitor { template Status WriteVarBytes(const T& array) { WriteValidityField(array); - WriteIntegerField("OFFSET", array.raw_offsets(), array.length() + 1); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); WriteDataField(array); SetNoChildren(); return Status::OK(); @@ -532,7 +532,7 @@ class JsonArrayWriter : public ArrayVisitor { Status Visit(const ListArray& array) override { WriteValidityField(array); - WriteIntegerField("OFFSET", array.raw_offsets(), array.length() + 1); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1); auto type = static_cast(array.type().get()); return WriteChildren(type->children(), {array.values()}); } @@ -549,7 +549,7 @@ class JsonArrayWriter : public ArrayVisitor { WriteIntegerField("TYPE_ID", array.raw_type_ids(), array.length()); if (type->mode == UnionMode::DENSE) { - WriteIntegerField("OFFSET", array.raw_offsets(), array.length()); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); } return WriteChildren(type->children(), array.children()); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index e30f4cc58d7..a8d6443a6bc 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -172,9 +172,9 @@ class ArrayPrinter : public ArrayVisitor { RETURN_NOT_OK(WriteValidityBitmap(array)); Newline(); - Write("-- offsets: "); - Int32Array offsets(array.length() + 1, array.offsets()); - RETURN_NOT_OK(PrettyPrint(offsets, indent_ + 2, sink_)); + Write("-- value_offsets: "); + Int32Array value_offsets(array.length() + 1, array.value_offsets()); + RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); Newline(); Write("-- values: "); @@ -209,9 +209,9 @@ class ArrayPrinter : public ArrayVisitor { if (array.mode() == UnionMode::DENSE) { Newline(); - Write("-- offsets: "); - Int32Array offsets(array.length(), array.offsets()); - RETURN_NOT_OK(PrettyPrint(offsets, indent_ + 2, sink_)); + Write("-- value_offsets: "); + Int32Array value_offsets(array.length(), array.value_offsets()); + RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); } return PrintChildren(array.children()); From 55454d776136cfd3e98902ddc1765d46e5aa6e32 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 19:36:41 -0500 Subject: [PATCH 06/15] Add slice tests for struct, union, string, list Change-Id: I5128f1a3372c5f3c219a9f2eaaee438418962a82 --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array-list-test.cc | 28 ++++++++++-- cpp/src/arrow/array-primitive-test.cc | 8 +++- cpp/src/arrow/array-string-test.cc | 63 ++++++++++++++++++++++++++- cpp/src/arrow/array-struct-test.cc | 17 ++++++++ cpp/src/arrow/array.h | 2 +- 6 files changed, 110 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b002bb75ca9..824ced1a51e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -53,6 +53,7 @@ ADD_ARROW_TEST(array-list-test) ADD_ARROW_TEST(array-primitive-test) ADD_ARROW_TEST(array-string-test) ADD_ARROW_TEST(array-struct-test) +ADD_ARROW_TEST(array-union-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(column-test) ADD_ARROW_TEST(memory_pool-test) diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 4ff7cac77db..a144fd937d7 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -90,9 +90,9 @@ TEST_F(TestListBuilder, Equality) { Int32Builder* vb = static_cast(builder_->value_builder().get()); std::shared_ptr array, equal_array, unequal_array; - vector equal_offsets = {0, 1, 2, 5}; - vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2}; - vector unequal_offsets = {0, 1, 4}; + vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + vector unequal_offsets = {0, 1, 4, 7}; vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; // setup two equal arrays @@ -122,7 +122,27 @@ TEST_F(TestListBuilder, Equality) { EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); - EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array)); + + // Check with slices, ARROW-33 + std::shared_ptr slice, slice2; + + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(array->length() - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(1)->Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 4); + slice2 = array->Slice(1, 4); + ASSERT_EQ(4, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); } TEST_F(TestListBuilder, TestResize) {} diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index 9245655fa5e..d781e9cde00 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -365,14 +365,18 @@ TYPED_TEST(TestPrimitiveBuilder, SliceEquality) { ASSERT_EQ(size - 5, slice->length()); ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); + ASSERT_TRUE(array->RangeEquals(5, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(2)->Slice(3); + ASSERT_TRUE(slice->Equals(slice2)); slice = array->Slice(5, 10); slice2 = array->Slice(5, 10); ASSERT_EQ(10, slice->length()); ASSERT_TRUE(slice->Equals(slice2)); - ASSERT_TRUE(array->RangeEquals(5, 10, 0, slice)); + ASSERT_TRUE(array->RangeEquals(5, 15, 0, slice)); } TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index b57ce057304..e2712738628 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -27,6 +27,7 @@ #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" namespace arrow { @@ -348,8 +349,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { if (is_null[i]) { builder_->AppendNull(); } else { - builder_->Append( - reinterpret_cast(strings[i].data()), strings[i].size()); + builder_->Append(strings[i]); } } } @@ -377,4 +377,63 @@ TEST_F(TestBinaryBuilder, TestZeroLength) { Done(); } +// ---------------------------------------------------------------------- +// Slice tests + +template +void CheckSliceEquality() { + using Traits = TypeTraits; + using BuilderType = typename Traits::BuilderType; + + auto type = Traits::type_singleton(); + BuilderType builder(default_memory_pool(), type); + + std::vector strings = {"foo", "", "bar", "baz", "qux", ""}; + std::vector is_null = {0, 1, 0, 1, 0, 0}; + + int N = strings.size(); + int reps = 10; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder.AppendNull(); + } else { + builder.Append(strings[i]); + } + } + } + + std::shared_ptr array; + ASSERT_OK(builder.Finish(&array)); + + std::shared_ptr slice, slice2; + + slice = array->Slice(5); + slice2 = array->Slice(5); + ASSERT_EQ(N * reps - 5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(2)->Slice(3); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(5, 20); + slice2 = array->Slice(5, 20); + ASSERT_EQ(20, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); +} + +TEST_F(TestBinaryArray, TestSliceEquality) { + CheckSliceEquality(); +} + +TEST_F(TestStringArray, TestSliceEquality) { + CheckSliceEquality(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index 6790e9b9c29..f4e7409a623 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -381,6 +381,23 @@ TEST_F(TestStructBuilder, TestEquality) { EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array)); EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array)); EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array)); + + // ARROW-33 Slice / equality + std::shared_ptr slice, slice2; + + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(array->length() - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); + + slice = array->Slice(1, 2); + slice2 = array->Slice(1, 2); + ASSERT_EQ(2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 3, 0, slice)); } TEST_F(TestStructBuilder, TestZeroLength) { diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index e089a5f94df..404694a871b 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -431,7 +431,7 @@ class ARROW_EXPORT UnionArray : public Array { /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return value_offsets_; } - const uint8_t* raw_type_ids() const { return raw_type_ids_; } + const uint8_t* raw_type_ids() const { return raw_type_ids_ + offset_; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } UnionMode mode() const { return static_cast(*type_.get()).mode; } From 8900d58f70c66ffac153623a125b012cf721c3a6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 20:03:08 -0500 Subject: [PATCH 07/15] Add Slice tests for DictionaryArray. Test recomputing the null count Change-Id: Id13727cefbacadb89dee6e178175634f84d00cb0 --- cpp/src/arrow/array-dictionary-test.cc | 42 ++++++++++++++++++++------ cpp/src/arrow/array-test.cc | 17 +++++++++++ 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc index 1a0d49a118f..ef1e5e07f02 100644 --- a/cpp/src/arrow/array-dictionary-test.cc +++ b/cpp/src/arrow/array-dictionary-test.cc @@ -74,25 +74,47 @@ TEST(TestDictionary, Equals) { std::vector indices3_values = {1, 1, 0, 0, 2, 0}; ArrayFromVector(int16(), is_valid, indices3_values, &indices3); - auto arr = std::make_shared(dict_type, indices); - auto arr2 = std::make_shared(dict_type, indices2); - auto arr3 = std::make_shared(dict2_type, indices); - auto arr4 = std::make_shared(dict_type, indices3); + auto array = std::make_shared(dict_type, indices); + auto array2 = std::make_shared(dict_type, indices2); + auto array3 = std::make_shared(dict2_type, indices); + auto array4 = std::make_shared(dict_type, indices3); - ASSERT_TRUE(arr->Equals(arr)); + ASSERT_TRUE(array->Equals(array)); // Equal, because the unequal index is masked by null - ASSERT_TRUE(arr->Equals(arr2)); + ASSERT_TRUE(array->Equals(array2)); // Unequal dictionaries - ASSERT_FALSE(arr->Equals(arr3)); + ASSERT_FALSE(array->Equals(array3)); // Unequal indices - ASSERT_FALSE(arr->Equals(arr4)); + ASSERT_FALSE(array->Equals(array4)); // RangeEquals - ASSERT_TRUE(arr->RangeEquals(3, 6, 3, arr4)); - ASSERT_FALSE(arr->RangeEquals(1, 3, 1, arr4)); + ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4)); + ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4)); + + // ARROW-33 Test slices + const int size = array->length(); + + std::shared_ptr slice, slice2; + slice = array->Array::Slice(2); + slice2 = array->Array::Slice(2); + ASSERT_EQ(size - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Array::Slice(1)->Array::Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 3); + slice2 = array->Slice(1, 3); + ASSERT_EQ(3, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice)); } TEST(TestDictionary, Validate) { diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 5a69913086d..2e8f84897a6 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -87,6 +87,23 @@ TEST_F(TestArray, TestEquality) { EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); } +TEST_F(TestArray, SliceRecomputeNullCount) { + std::vector valid_bytes = {1, 0, 1, 1, 0, 1, 0, 0}; + + auto array = MakeArrayFromValidBytes(valid_bytes, pool_); + + ASSERT_EQ(4, array->null_count()); + + auto slice = array->Slice(1, 4); + ASSERT_EQ(2, slice->null_count()); + + slice = array->Slice(4); + ASSERT_EQ(1, slice->null_count()); + + slice = array->Slice(0); + ASSERT_EQ(4, slice->null_count()); +} + TEST_F(TestArray, TestIsNull) { // clang-format off std::vector null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, From b6c511ec31fc06c324488eb7bef370e4fb83d38b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 20:11:37 -0500 Subject: [PATCH 08/15] Add RecordBatch::Slice convenience method Change-Id: I393711125073fec11860c47bf3496805be6618ba --- cpp/src/arrow/table-test.cc | 26 ++++++++++++++++++++++++++ cpp/src/arrow/table.cc | 13 +++++++++++++ cpp/src/arrow/table.h | 4 ++++ 3 files changed, 43 insertions(+) diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 67c9f6703f4..e7c5d667903 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -242,4 +242,30 @@ TEST_F(TestRecordBatch, Equals) { ASSERT_FALSE(b1.Equals(b4)); } +TEST_F(TestRecordBatch, Slice) { + const int length = 10; + + auto f0 = std::make_shared("f0", int32()); + auto f1 = std::make_shared("f1", uint8()); + + vector> fields = {f0, f1}; + auto schema = std::make_shared(fields); + + auto a0 = MakePrimitive(length); + auto a1 = MakePrimitive(length); + + RecordBatch batch(schema, length, {a0, a1}); + + auto batch_slice = batch.Slice(2); + auto batch_slice2 = batch.Slice(1, 5); + + for (int i = 0; i < batch.num_columns(); ++i) { + ASSERT_EQ(2, batch_slice->column(i)->offset()); + ASSERT_EQ(length - 2, batch_slice->column(i)->length()); + + ASSERT_EQ(1, batch_slice2->column(i)->offset()); + ASSERT_EQ(5, batch_slice2->column(i)->length()); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 0908f71d447..9e31ba5af0c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -60,6 +60,19 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other) const { return true; } +std::shared_ptr RecordBatch::Slice(int32_t offset) { + return Slice(offset, this->num_rows() - offset); +} + +std::shared_ptr RecordBatch::Slice(int32_t offset, int32_t length) { + std::vector> arrays; + arrays.reserve(num_columns()); + for (const auto& field : columns_) { + arrays.emplace_back(field->Slice(offset, length)); + } + return std::make_shared(schema_, num_rows_, arrays); +} + // ---------------------------------------------------------------------- // Table methods diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 583847cfbe3..fa56824a5a1 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -64,6 +64,10 @@ class ARROW_EXPORT RecordBatch { // @returns: the number of rows (the corresponding length of each column) int32_t num_rows() const { return num_rows_; } + /// Slice each of the arrays in the record batch and construct a new RecordBatch object + std::shared_ptr Slice(int32_t offset); + std::shared_ptr Slice(int32_t offset, int32_t length); + private: std::shared_ptr schema_; int32_t num_rows_; From c6d814dc8588c50d904bacdaaf0dd5a9fc1ea9f8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 5 Feb 2017 21:54:35 -0500 Subject: [PATCH 09/15] Work on adding sliced array support to IPC code path, with pretty printer and comparison fixed for sliced bitmaps, etc. Not all working yet Change-Id: I74c0e1548b420becac2d4d329e752d5719d51912 --- cpp/src/arrow/array-union-test.cc | 67 ++++++++++ cpp/src/arrow/buffer.h | 2 +- cpp/src/arrow/compare.cc | 28 ++-- cpp/src/arrow/ipc/adapter.cc | 181 +++++++++++++++++++++----- cpp/src/arrow/ipc/adapter.h | 8 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 40 +++++- cpp/src/arrow/ipc/stream.cc | 15 ++- cpp/src/arrow/ipc/stream.h | 8 ++ cpp/src/arrow/pretty_print.cc | 14 +- cpp/src/arrow/util/bit-util.cc | 12 +- cpp/src/arrow/util/bit-util.h | 5 +- 11 files changed, 318 insertions(+), 62 deletions(-) create mode 100644 cpp/src/arrow/array-union-test.cc diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc new file mode 100644 index 00000000000..eb9bd7da31b --- /dev/null +++ b/cpp/src/arrow/array-union-test.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Tests for UnionArray + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/ipc/test-common.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +TEST(TestUnionArrayAdHoc, TestSliceEquals) { + std::shared_ptr batch; + ASSERT_OK(ipc::MakeUnion(&batch)); + + const int size = batch->num_rows(); + + auto CheckUnion = [&size](std::shared_ptr array) { + std::shared_ptr slice, slice2; + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(size - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(1)->Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 5); + slice2 = array->Slice(1, 5); + ASSERT_EQ(5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 6, 0, slice)); + }; + + CheckUnion(batch->column(1)); + CheckUnion(batch->column(2)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 2271a399585..9c400b12903 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -105,7 +105,7 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { /// Construct a view on passed buffer at the indicated offset and length. This /// function cannot fail and does not error checking (except in debug builds) -ARROW_EXPORT std::shared_ptr SliceBuffer( +std::shared_ptr ARROW_EXPORT SliceBuffer( const std::shared_ptr& buffer, int64_t offset, int64_t length); /// A Buffer whose contents can be mutated. May or may not own its data. diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 679650920d5..7a26db412dd 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -315,20 +315,28 @@ class EqualsVisitor : public RangeEqualsVisitor { } result_ = true; } else { - result_ = left.data()->Equals(*right.data(), BitUtil::BytesForBits(left.length())); + result_ = BitmapEquals(left.data()->data(), right.data()->data(), left.length()); } return Status::OK(); } bool IsEqualPrimitive(const PrimitiveArray& left) { const auto& right = static_cast(right_); - if (left.null_count() > 0) { - const uint8_t* left_data = left.data()->data(); - const uint8_t* right_data = right.data()->data(); - const auto& size_meta = dynamic_cast(*left.type()); - const int value_byte_size = size_meta.bit_width() / 8; - DCHECK_GT(value_byte_size, 0); + const auto& size_meta = dynamic_cast(*left.type()); + const int value_byte_size = size_meta.bit_width() / 8; + DCHECK_GT(value_byte_size, 0); + + const uint8_t* left_data = nullptr; + if (left.length() > 0) { + left_data = left.data()->data() + left.offset() * value_byte_size; + } + + const uint8_t* right_data = nullptr; + if (right.length() > 0) { + right_data = right.data()->data() + right.offset() * value_byte_size; + } + if (left.null_count() > 0) { for (int i = 0; i < left.length(); ++i) { if (!left.IsNull(i) && memcmp(left_data, right_data, value_byte_size)) { return false; @@ -339,7 +347,7 @@ class EqualsVisitor : public RangeEqualsVisitor { return true; } else { if (left.length() == 0) { return true; } - return left.data()->Equals(*right.data(), left.length()); + return memcmp(left_data, right_data, value_byte_size * left.length()) == 0; } } @@ -465,8 +473,8 @@ static bool BaseDataEquals(const Array& left, const Array& right) { return false; } if (left.null_count() > 0) { - return left.null_bitmap()->Equals( - *right.null_bitmap(), BitUtil::BytesForBits(left.length())); + return BitmapEquals( + left.null_bitmap()->data(), right.null_bitmap()->data(), left.length()); } return true; } diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 4ca716add77..e14768c68bb 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -30,6 +30,7 @@ #include "arrow/ipc/metadata-internal.h" #include "arrow/ipc/metadata.h" #include "arrow/ipc/util.h" +#include "arrow/memory_pool.h" #include "arrow/schema.h" #include "arrow/status.h" #include "arrow/table.h" @@ -49,9 +50,10 @@ namespace ipc { class RecordBatchWriter : public ArrayVisitor { public: - RecordBatchWriter( - const RecordBatch& batch, int64_t buffer_start_offset, int max_recursion_depth) - : batch_(batch), + RecordBatchWriter(MemoryPool* pool, const RecordBatch& batch, + int64_t buffer_start_offset, int max_recursion_depth) + : pool_(pool), + batch_(batch), max_recursion_depth_(max_recursion_depth), buffer_start_offset_(buffer_start_offset) {} @@ -62,7 +64,15 @@ class RecordBatchWriter : public ArrayVisitor { // push back all common elements field_nodes_.push_back(flatbuf::FieldNode(arr.length(), arr.null_count())); if (arr.null_count() > 0) { - buffers_.push_back(arr.null_bitmap()); + std::shared_ptr bitmap = arr.null_bitmap(); + + if (arr.offset() != 0) { + // With a sliced array / non-zero offset, we must copy the bitmap + RETURN_NOT_OK( + CopyBitmap(pool_, bitmap->data(), arr.offset(), arr.length(), &bitmap)); + } + + buffers_.push_back(bitmap); } else { // Push a dummy zero-length buffer, not to be copied buffers_.push_back(std::make_shared(nullptr, 0)); @@ -208,50 +218,136 @@ class RecordBatchWriter : public ArrayVisitor { private: Status Visit(const NullArray& array) override { return Status::NotImplemented("null"); } - Status VisitPrimitive(const PrimitiveArray& array) { - buffers_.push_back(array.data()); + template + Status VisitFixedWidth(const ArrayType& array) { + std::shared_ptr data_buffer = array.data(); + + if (array.offset() != 0) { + // Non-zero offset, slice the buffer + const auto& fw_type = static_cast(*array.type()); + const int type_width = fw_type.bit_width() / 8; + const int64_t byte_offset = array.offset() * type_width; + + // Send padding if it's available + const int64_t buffer_length = + std::min(BitUtil::RoundUpToMultipleOf64(array.length() * type_width), + data_buffer->size() - byte_offset); + data_buffer = SliceBuffer(data_buffer, byte_offset, buffer_length); + } + buffers_.push_back(data_buffer); + return Status::OK(); + } + + template + Status GetZeroBasedValueOffsets( + const ArrayType& array, std::shared_ptr* value_offsets) { + // Share slicing logic between ListArray and BinaryArray + + auto offsets = array.value_offsets(); + + if (array.offset() != 0) { + // If we have a non-zero offset, then the value offsets do not start at + // zero. We must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly + + std::shared_ptr shifted_offsets; + RETURN_NOT_OK(AllocateBuffer( + pool_, sizeof(int32_t) * (array.length() + 1), &shifted_offsets)); + + int32_t* dest_offsets = reinterpret_cast(shifted_offsets->mutable_data()); + const int32_t start_offset = array.value_offset(0); + + for (int i = 0; i < array.length(); ++i) { + dest_offsets[i] = array.value_offset(i) - start_offset; + } + // Final offset + dest_offsets[array.length()] = array.value_offset(array.length()) - start_offset; + offsets = shifted_offsets; + } + + *value_offsets = offsets; return Status::OK(); } Status VisitBinary(const BinaryArray& array) { - buffers_.push_back(array.value_offsets()); - buffers_.push_back(array.data()); + std::shared_ptr value_offsets; + RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + auto data = array.data(); + + if (array.offset() != 0) { + // Slice the data buffer to include only the range we need now + data = SliceBuffer(data, array.value_offset(0), array.value_offset(array.length())); + } + + buffers_.push_back(value_offsets); + buffers_.push_back(data); return Status::OK(); } - Status Visit(const BooleanArray& array) override { return VisitPrimitive(array); } + Status Visit(const BooleanArray& array) override { + buffers_.push_back(array.data()); + return Status::OK(); + } - Status Visit(const Int8Array& array) override { return VisitPrimitive(array); } + Status Visit(const Int8Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const Int16Array& array) override { return VisitPrimitive(array); } + Status Visit(const Int16Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const Int32Array& array) override { return VisitPrimitive(array); } + Status Visit(const Int32Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const Int64Array& array) override { return VisitPrimitive(array); } + Status Visit(const Int64Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const UInt8Array& array) override { return VisitPrimitive(array); } + Status Visit(const UInt8Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const UInt16Array& array) override { return VisitPrimitive(array); } + Status Visit(const UInt16Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const UInt32Array& array) override { return VisitPrimitive(array); } + Status Visit(const UInt32Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const UInt64Array& array) override { return VisitPrimitive(array); } + Status Visit(const UInt64Array& array) override { + return VisitFixedWidth(array); + } - Status Visit(const HalfFloatArray& array) override { return VisitPrimitive(array); } + Status Visit(const HalfFloatArray& array) override { + return VisitFixedWidth(array); + } - Status Visit(const FloatArray& array) override { return VisitPrimitive(array); } + Status Visit(const FloatArray& array) override { + return VisitFixedWidth(array); + } - Status Visit(const DoubleArray& array) override { return VisitPrimitive(array); } + Status Visit(const DoubleArray& array) override { + return VisitFixedWidth(array); + } Status Visit(const StringArray& array) override { return VisitBinary(array); } Status Visit(const BinaryArray& array) override { return VisitBinary(array); } - Status Visit(const DateArray& array) override { return VisitPrimitive(array); } + Status Visit(const DateArray& array) override { + return VisitFixedWidth(array); + } - Status Visit(const TimeArray& array) override { return VisitPrimitive(array); } + Status Visit(const TimeArray& array) override { + return VisitFixedWidth(array); + } - Status Visit(const TimestampArray& array) override { return VisitPrimitive(array); } + Status Visit(const TimestampArray& array) override { + return VisitFixedWidth(array); + } Status Visit(const IntervalArray& array) override { return Status::NotImplemented("interval"); @@ -262,17 +358,32 @@ class RecordBatchWriter : public ArrayVisitor { } Status Visit(const ListArray& array) override { - buffers_.push_back(array.value_offsets()); + std::shared_ptr value_offsets; + RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); + buffers_.push_back(value_offsets); + --max_recursion_depth_; - RETURN_NOT_OK(VisitArray(*array.values().get())); + std::shared_ptr values = array.values(); + + if (array.offset() != 0) { + // For non-zero offset, we slice the values array accordingly + const int32_t offset = array.value_offset(0); + const int32_t length = array.value_offset(array.length()) - offset; + values = values->Slice(offset, length); + } + RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return Status::OK(); } Status Visit(const StructArray& array) override { --max_recursion_depth_; - for (const auto& field : array.fields()) { - RETURN_NOT_OK(VisitArray(*field.get())); + for (std::shared_ptr field : array.fields()) { + if (array.offset() != 0) { + // If offset is non-zero, slice the child array + field = field->Slice(array.offset(), array.length()); + } + RETURN_NOT_OK(VisitArray(*field)); } ++max_recursion_depth_; return Status::OK(); @@ -284,8 +395,12 @@ class RecordBatchWriter : public ArrayVisitor { if (array.mode() == UnionMode::DENSE) { buffers_.push_back(array.value_offsets()); } --max_recursion_depth_; - for (const auto& field : array.children()) { - RETURN_NOT_OK(VisitArray(*field.get())); + for (std::shared_ptr field : array.children()) { + if (array.offset() != 0) { + // If offset is non-zero, slice the child array + field = field->Slice(array.offset(), array.length()); + } + RETURN_NOT_OK(VisitArray(*field)); } ++max_recursion_depth_; return Status::OK(); @@ -298,6 +413,8 @@ class RecordBatchWriter : public ArrayVisitor { return Status::OK(); } + // In some cases, intermediate buffers may need to be allocated (with sliced arrays) + MemoryPool* pool_; const RecordBatch& batch_; std::vector field_nodes_; @@ -310,14 +427,14 @@ class RecordBatchWriter : public ArrayVisitor { Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, int64_t* body_length, - int max_recursion_depth) { + MemoryPool* pool, int max_recursion_depth) { DCHECK_GT(max_recursion_depth, 0); - RecordBatchWriter serializer(batch, buffer_start_offset, max_recursion_depth); + RecordBatchWriter serializer(pool, batch, buffer_start_offset, max_recursion_depth); return serializer.Write(dst, metadata_length, body_length); } Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size) { - RecordBatchWriter serializer(batch, 0, kMaxIpcRecursionDepth); + RecordBatchWriter serializer(default_memory_pool(), batch, 0, kMaxIpcRecursionDepth); RETURN_NOT_OK(serializer.GetTotalSize(size)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index f9ef7d9fe12..83542d0b066 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -30,6 +30,7 @@ namespace arrow { class Array; +class MemoryPool; class RecordBatch; class Schema; class Status; @@ -71,14 +72,15 @@ constexpr int kMaxIpcRecursionDepth = 64; // // @param(out) body_length: the size of the contiguous buffer block plus // padding bytes -ARROW_EXPORT Status WriteRecordBatch(const RecordBatch& batch, +Status ARROW_EXPORT WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset, io::OutputStream* dst, int32_t* metadata_length, - int64_t* body_length, int max_recursion_depth = kMaxIpcRecursionDepth); + int64_t* body_length, MemoryPool* pool, + int max_recursion_depth = kMaxIpcRecursionDepth); // Compute the precise number of bytes needed in a contiguous memory segment to // write the record batch. This involves generating the complete serialized // Flatbuffers metadata. -ARROW_EXPORT Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size); +Status ARROW_EXPORT GetRecordBatchSize(const RecordBatch& batch, int64_t* size); // ---------------------------------------------------------------------- // "Read" path; does not copy data if the input supports zero copy reads diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 17868f8f102..ecd2d68fea0 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -32,6 +32,7 @@ #include "arrow/buffer.h" #include "arrow/memory_pool.h" +#include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/test-util.h" #include "arrow/util/bit-util.h" @@ -56,7 +57,7 @@ class TestWriteRecordBatch : public ::testing::TestWithParam, const int64_t buffer_offset = 0; RETURN_NOT_OK(WriteRecordBatch( - batch, buffer_offset, mmap_.get(), &metadata_length, &body_length)); + batch, buffer_offset, mmap_.get(), &metadata_length, &body_length, pool_)); std::shared_ptr metadata; RETURN_NOT_OK(ReadRecordBatchMetadata(0, metadata_length, mmap_.get(), &metadata)); @@ -92,6 +93,33 @@ TEST_P(TestWriteRecordBatch, RoundTrip) { } } +TEST_P(TestWriteRecordBatch, SliceRoundTrip) { + std::shared_ptr batch; + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue + std::shared_ptr batch_result; + + auto sliced_batch = batch->Slice(2, 10); + + ASSERT_OK(RoundTripHelper(*sliced_batch, 1 << 16, &batch_result)); + + EXPECT_EQ(sliced_batch->num_rows(), batch_result->num_rows()); + + for (int i = 0; i < sliced_batch->num_columns(); ++i) { + const auto& left = *sliced_batch->column(i); + const auto& right = *batch_result->column(i); + if (!left.Equals(right)) { + std::stringstream pp_result; + std::stringstream pp_expected; + + ASSERT_OK(PrettyPrint(left, 0, &pp_expected)); + ASSERT_OK(PrettyPrint(right, 0, &pp_result)); + + FAIL() << "Index: " << i << " Expected: " << pp_expected.str() + << "\nGot: " << pp_result.str(); + } + } +} + INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRecordBatch, ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, @@ -102,7 +130,8 @@ void TestGetRecordBatchSize(std::shared_ptr batch) { int32_t mock_metadata_length = -1; int64_t mock_body_length = -1; int64_t size = -1; - ASSERT_OK(WriteRecordBatch(*batch, 0, &mock, &mock_metadata_length, &mock_body_length)); + ASSERT_OK(WriteRecordBatch( + *batch, 0, &mock, &mock_metadata_length, &mock_body_length, default_memory_pool())); ASSERT_OK(GetRecordBatchSize(*batch, &size)); ASSERT_EQ(mock.GetExtentBytesWritten(), size); } @@ -156,10 +185,11 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { io::MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); if (override_level) { - return WriteRecordBatch( - *batch, 0, mmap_.get(), metadata_length, body_length, recursion_level + 1); + return WriteRecordBatch(*batch, 0, mmap_.get(), metadata_length, body_length, pool_, + recursion_level + 1); } else { - return WriteRecordBatch(*batch, 0, mmap_.get(), metadata_length, body_length); + return WriteRecordBatch( + *batch, 0, mmap_.get(), metadata_length, body_length, pool_); } } diff --git a/cpp/src/arrow/ipc/stream.cc b/cpp/src/arrow/ipc/stream.cc index c9057e860b1..72eb13465af 100644 --- a/cpp/src/arrow/ipc/stream.cc +++ b/cpp/src/arrow/ipc/stream.cc @@ -28,6 +28,7 @@ #include "arrow/ipc/adapter.h" #include "arrow/ipc/metadata.h" #include "arrow/ipc/util.h" +#include "arrow/memory_pool.h" #include "arrow/schema.h" #include "arrow/status.h" #include "arrow/util/logging.h" @@ -41,7 +42,11 @@ namespace ipc { StreamWriter::~StreamWriter() {} StreamWriter::StreamWriter(io::OutputStream* sink, const std::shared_ptr& schema) - : sink_(sink), schema_(schema), position_(-1), started_(false) {} + : sink_(sink), + schema_(schema), + pool_(default_memory_pool()), + position_(-1), + started_(false) {} Status StreamWriter::UpdatePosition() { return sink_->Tell(&position_); @@ -76,8 +81,8 @@ Status StreamWriter::WriteRecordBatch(const RecordBatch& batch, FileBlock* block // Frame of reference in file format is 0, see ARROW-384 const int64_t buffer_start_offset = 0; - RETURN_NOT_OK(arrow::ipc::WriteRecordBatch( - batch, buffer_start_offset, sink_, &block->metadata_length, &block->body_length)); + RETURN_NOT_OK(arrow::ipc::WriteRecordBatch(batch, buffer_start_offset, sink_, + &block->metadata_length, &block->body_length, pool_)); RETURN_NOT_OK(UpdatePosition()); DCHECK(position_ % 8 == 0) << "WriteRecordBatch did not perform aligned writes"; @@ -85,6 +90,10 @@ Status StreamWriter::WriteRecordBatch(const RecordBatch& batch, FileBlock* block return Status::OK(); } +void StreamWriter::set_memory_pool(MemoryPool* pool) { + pool_ = pool; +} + // ---------------------------------------------------------------------- // StreamWriter implementation diff --git a/cpp/src/arrow/ipc/stream.h b/cpp/src/arrow/ipc/stream.h index 53f51dc7367..12414fa2ca0 100644 --- a/cpp/src/arrow/ipc/stream.h +++ b/cpp/src/arrow/ipc/stream.h @@ -30,6 +30,7 @@ namespace arrow { class Array; class Buffer; struct Field; +class MemoryPool; class RecordBatch; class Schema; class Status; @@ -59,6 +60,10 @@ class ARROW_EXPORT StreamWriter { /// closing the actual OutputStream virtual Status Close(); + // In some cases, writing may require memory allocation. We use the default + // memory pool, but provide the option to override + void set_memory_pool(MemoryPool* pool); + protected: StreamWriter(io::OutputStream* sink, const std::shared_ptr& schema); @@ -81,6 +86,9 @@ class ARROW_EXPORT StreamWriter { io::OutputStream* sink_; std::shared_ptr schema_; + + MemoryPool* pool_; + int64_t position_; bool started_; }; diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index a8d6443a6bc..288667834e4 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -178,7 +178,11 @@ class ArrayPrinter : public ArrayVisitor { Newline(); Write("-- values: "); - RETURN_NOT_OK(PrettyPrint(*array.values().get(), indent_ + 2, sink_)); + auto values = array.values(); + if (array.offset() != 0) { + values = values->Slice(array.value_offset(0), array.value_offset(array.length())); + } + RETURN_NOT_OK(PrettyPrint(*values, indent_ + 2, sink_)); return Status::OK(); } @@ -189,7 +193,7 @@ class ArrayPrinter : public ArrayVisitor { std::stringstream ss; ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << " values: "; Write(ss.str()); - RETURN_NOT_OK(PrettyPrint(*fields[i].get(), indent_ + 2, sink_)); + RETURN_NOT_OK(PrettyPrint(*fields[i], indent_ + 2, sink_)); } return Status::OK(); } @@ -222,11 +226,11 @@ class ArrayPrinter : public ArrayVisitor { Newline(); Write("-- dictionary: "); - RETURN_NOT_OK(PrettyPrint(*array.dictionary().get(), indent_ + 2, sink_)); + RETURN_NOT_OK(PrettyPrint(*array.dictionary(), indent_ + 2, sink_)); Newline(); Write("-- indices: "); - return PrettyPrint(*array.indices().get(), indent_ + 2, sink_); + return PrettyPrint(*array.indices(), indent_ + 2, sink_); } void Write(const char* data) { (*sink_) << data; } @@ -260,7 +264,7 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) { for (int i = 0; i < batch.num_columns(); ++i) { const std::string& name = batch.column_name(i); (*sink) << name << ": "; - RETURN_NOT_OK(PrettyPrint(*batch.column(i).get(), indent + 2, sink)); + RETURN_NOT_OK(PrettyPrint(*batch.column(i), indent + 2, sink)); (*sink) << "\n"; } return Status::OK(); diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 5c6792f9cb0..bf7731f22c5 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -92,7 +92,7 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { } Status GetEmptyBitmap( - MemoryPool* pool, int32_t length, std::shared_ptr* result) { + MemoryPool* pool, int64_t length, std::shared_ptr* result) { RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), result)); memset((*result)->mutable_data(), 0, (*result)->size()); return Status::OK(); @@ -101,7 +101,7 @@ Status GetEmptyBitmap( Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int32_t offset, int32_t length, std::shared_ptr* out) { std::shared_ptr buffer; - RETURN_NOT_OK(AllocateBuffer(pool, BitUtil::BytesForBits(length), &buffer)); + RETURN_NOT_OK(GetEmptyBitmap(pool, length, &buffer)); uint8_t* dest = buffer->mutable_data(); for (int64_t i = 0; i < length; ++i) { BitUtil::SetBitTo(dest, i, BitUtil::GetBit(data, i + offset)); @@ -110,4 +110,12 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int32_t offset, int32_t return Status::OK(); } +bool BitmapEquals(const uint8_t* left, const uint8_t* right, int64_t bit_length) { + // TODO(wesm): Make this faster using word-wise comparisons + for (int64_t i = 0; i < bit_length; ++i) { + if (BitUtil::GetBit(left, i) != BitUtil::GetBit(right, i)) { return false; } + } + return true; +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index fe71ed32772..bfbe657325c 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -116,7 +116,7 @@ ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr* result); + MemoryPool* pool, int64_t length, std::shared_ptr* result); /// Copy a bit range of an existing bitmap /// @@ -140,6 +140,9 @@ Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int32_t int64_t ARROW_EXPORT CountSetBits( const uint8_t* data, int64_t bit_offset, int64_t length); +bool ARROW_EXPORT BitmapEquals( + const uint8_t* left, const uint8_t* right, int64_t bit_length); + } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H From 1a6fcb44150e9fe96ad349e66bdb4015bf1e8001 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 00:55:41 -0500 Subject: [PATCH 10/15] Make some more progress. dense union needs more work Change-Id: I1096b38d9c78f50b7d92e2abe7af237d80dae5dd --- cpp/src/arrow/array-test.cc | 9 ++++ cpp/src/arrow/array.cc | 8 ++- cpp/src/arrow/array.h | 5 +- cpp/src/arrow/compare.cc | 76 ++++++++++++++++++++++----- cpp/src/arrow/ipc/adapter.cc | 40 +++++++++++++- cpp/src/arrow/ipc/ipc-adapter-test.cc | 12 +++-- cpp/src/arrow/ipc/json-internal.cc | 16 +++--- cpp/src/arrow/ipc/test-common.h | 3 +- cpp/src/arrow/pretty_print.cc | 32 +++++++---- cpp/src/arrow/type.cc | 6 +-- cpp/src/arrow/type.h | 8 +-- cpp/src/arrow/util/bit-util.cc | 8 ++- cpp/src/arrow/util/bit-util.h | 4 +- 13 files changed, 174 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 2e8f84897a6..45130d8f640 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -102,6 +102,15 @@ TEST_F(TestArray, SliceRecomputeNullCount) { slice = array->Slice(0); ASSERT_EQ(4, slice->null_count()); + + // No bitmap, compute 0 + std::shared_ptr data; + const int kBufferSize = 64; + ASSERT_OK(AllocateBuffer(pool_, kBufferSize, &data)); + memset(data->mutable_data(), 0, kBufferSize); + + auto arr = std::make_shared(16, data, nullptr, -1); + ASSERT_EQ(0, arr->null_count()); } TEST_F(TestArray, TestIsNull) { diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5bf39315702..f84023e6c7d 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -52,8 +52,12 @@ Array::Array(const std::shared_ptr& type, int32_t length, } int32_t Array::null_count() const { - if (null_count_ < 0 && null_bitmap_) { - null_count_ = CountSetBits(null_bitmap_data_, offset_, length_); + if (null_count_ < 0) { + if (null_bitmap_) { + null_count_ = CountSetBits(null_bitmap_data_, offset_, length_); + } else { + null_count_ = 0; + } } return null_count_; } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 404694a871b..f3e8f9a4982 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -415,6 +415,7 @@ class ARROW_EXPORT StructArray : public Array { class ARROW_EXPORT UnionArray : public Array { public: using TypeClass = UnionType; + using type_id_t = uint8_t; UnionArray(const std::shared_ptr& type, int32_t length, const std::vector>& children, @@ -431,7 +432,7 @@ class ARROW_EXPORT UnionArray : public Array { /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return value_offsets_; } - const uint8_t* raw_type_ids() const { return raw_type_ids_ + offset_; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + offset_; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } UnionMode mode() const { return static_cast(*type_.get()).mode; } @@ -448,7 +449,7 @@ class ARROW_EXPORT UnionArray : public Array { std::vector> children_; std::shared_ptr type_ids_; - const uint8_t* raw_type_ids_; + const type_id_t* raw_type_ids_; std::shared_ptr value_offsets_; const int32_t* raw_value_offsets_; diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 7a26db412dd..9979554f88b 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -200,7 +200,11 @@ class RangeEqualsVisitor : public ArrayVisitor { for (size_t j = 0; j < left.fields().size(); ++j) { // TODO: really we should be comparing stretches of non-null data rather // than looking at one value at a time. - equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, right.field(j)); + const int left_abs_index = i + left.offset(); + const int right_abs_index = o_i + right.offset(); + + equal_fields = left.field(j)->RangeEquals( + left_abs_index, left_abs_index + 1, right_abs_index, right.field(j)); if (!equal_fields) { return false; } } } @@ -223,7 +227,7 @@ class RangeEqualsVisitor : public ArrayVisitor { // Define a mapping from the type id to child number uint8_t max_code = 0; - const std::vector type_codes = left_type.type_ids; + const std::vector type_codes = left_type.type_codes; for (size_t i = 0; i < type_codes.size(); ++i) { const uint8_t code = type_codes[i]; if (code > max_code) { max_code = code; } @@ -248,10 +252,14 @@ class RangeEqualsVisitor : public ArrayVisitor { id = left_ids[i]; child_num = type_id_to_child_num[id]; + const int left_abs_index = i + left.offset(); + const int right_abs_index = o_i + right.offset(); + // TODO(wesm): really we should be comparing stretches of non-null data // rather than looking at one value at a time. if (union_mode == UnionMode::SPARSE) { - if (!left.child(child_num)->RangeEquals(i, i + 1, o_i, right.child(child_num))) { + if (!left.child(child_num)->RangeEquals(left_abs_index, left_abs_index + 1, + right_abs_index, right.child(child_num))) { return false; } } else { @@ -315,7 +323,8 @@ class EqualsVisitor : public RangeEqualsVisitor { } result_ = true; } else { - result_ = BitmapEquals(left.data()->data(), right.data()->data(), left.length()); + result_ = BitmapEquals(left.data()->data(), left.offset(), right.data()->data(), + right.offset(), left.length()); } return Status::OK(); } @@ -384,13 +393,46 @@ class EqualsVisitor : public RangeEqualsVisitor { Status Visit(const IntervalArray& left) override { return ComparePrimitive(left); } + template + bool ValueOffsetsEqual(const ArrayType& left) { + const auto& right = static_cast(right_); + + if (left.offset() == 0 && right.offset() == 0) { + return left.value_offsets()->Equals( + *right.value_offsets(), (left.length() + 1) * sizeof(int32_t)); + } else { + // One of the arrays is sliced; logic is more complicated because the + // value offsets are not both 0-based + auto left_offsets = + reinterpret_cast(left.value_offsets()->data()) + left.offset(); + auto right_offsets = + reinterpret_cast(right.value_offsets()->data()) + + right.offset(); + + for (int32_t i = 0; i < left.length() + 1; ++i) { + if (left_offsets[i] - left_offsets[0] != right_offsets[i] - right_offsets[0]) { + return false; + } + } + return true; + } + } + bool CompareBinary(const BinaryArray& left) { const auto& right = static_cast(right_); - bool equal_offsets = left.value_offsets()->Equals( - *right.value_offsets(), (left.length() + 1) * sizeof(int32_t)); + + bool equal_offsets = ValueOffsetsEqual(left); if (!equal_offsets) { return false; } - if (!left.data() && !(right.data())) { return true; } - return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]); + + if (left.offset() == 0 && right.offset() == 0) { + if (!left.data() && !(right.data())) { return true; } + return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]); + } else { + // Compare the corresponding data range + const int64_t total_bytes = left.value_offset(left.length()) - left.value_offset(0); + return std::memcmp(left.data()->data() + left.value_offset(0), + right.data()->data() + right.value_offset(0), total_bytes) == 0; + } } Status Visit(const StringArray& left) override { @@ -405,12 +447,20 @@ class EqualsVisitor : public RangeEqualsVisitor { Status Visit(const ListArray& left) override { const auto& right = static_cast(right_); - if (!left.value_offsets()->Equals( - *right.value_offsets(), (left.length() + 1) * sizeof(int32_t))) { + bool equal_offsets = ValueOffsetsEqual(left); + if (!equal_offsets) { result_ = false; - } else { + return Status::OK(); + } + + if (left.offset() == 0 && right.offset() == 0) { result_ = left.values()->Equals(right.values()); + } else { + // One of the arrays is sliced + result_ = left.values()->RangeEquals(left.value_offset(0), + left.value_offset(left.length()), right.value_offset(0), right.values()); } + return Status::OK(); } @@ -473,8 +523,8 @@ static bool BaseDataEquals(const Array& left, const Array& right) { return false; } if (left.null_count() > 0) { - return BitmapEquals( - left.null_bitmap()->data(), right.null_bitmap()->data(), left.length()); + return BitmapEquals(left.null_bitmap()->data(), left.offset(), + right.null_bitmap()->data(), right.offset(), left.length()); } return true; } diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index e14768c68bb..67be508b2fb 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -390,9 +390,45 @@ class RecordBatchWriter : public ArrayVisitor { } Status Visit(const UnionArray& array) override { - buffers_.push_back(array.type_ids()); + auto type_ids = array.type_ids(); + if (array.offset() != 0) { + type_ids = SliceBuffer(type_ids, array.offset() * sizeof(UnionArray::type_id_t), + array.length() * sizeof(UnionArray::type_id_t)); + } + + const auto& type = static_cast(*array.type()); + + buffers_.push_back(type_ids); + + if (array.mode() == UnionMode::DENSE) { + auto value_offsets = array.value_offsets(); + if (array.offset() != 0) { + // This is an unpleasant case. Because the offsets are different for + // each child array, when we have a sliced array, we need to "rebase" + // the value_offsets for each array + + // std::shared_ptr shifted_buffer; + // RETURN_NOT_OK(AllocateBuffer(pool_, array.length() * sizeof(int32_t), + // &shifted_buffer)); + // int32_t* shifted_offsets = + // reinterpret_cast(shifted_buffer->mutable_data()); - if (array.mode() == UnionMode::DENSE) { buffers_.push_back(array.value_offsets()); } + // Allocate an array of base offsets + + // std::vector base_offsets(type.type_codes.size(), 0); + // const uint8_t* type_ids = array.raw_type_ids(); + + // TODO(wesm): XXX + + // Examine the data up until array.offset() to determine how much to + // shift the data + // for (int32_t i = 0 + + value_offsets = SliceBuffer(value_offsets, array.offset() * sizeof(int32_t), + array.length() * sizeof(int32_t)); + } + buffers_.push_back(value_offsets); + } --max_recursion_depth_; for (std::shared_ptr field : array.children()) { diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index ecd2d68fea0..bae6578f110 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -98,6 +98,9 @@ TEST_P(TestWriteRecordBatch, SliceRoundTrip) { ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue std::shared_ptr batch_result; + // Skip the zero-length case + if (batch->num_rows() < 2) { return; } + auto sliced_batch = batch->Slice(2, 10); ASSERT_OK(RoundTripHelper(*sliced_batch, 1 << 16, &batch_result)); @@ -120,10 +123,11 @@ TEST_P(TestWriteRecordBatch, SliceRoundTrip) { } } -INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRecordBatch, - ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, - &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, - &MakeStringTypesRecordBatch, &MakeStruct, &MakeUnion)); +INSTANTIATE_TEST_CASE_P( + RoundTripTests, TestWriteRecordBatch, + ::testing::Values(&MakeIntRecordBatch, &MakeStringTypesRecordBatch, + &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeListRecordBatch, + &MakeDeeplyNestedList, &MakeStruct, &MakeUnion)); void TestGetRecordBatchSize(std::shared_ptr batch) { ipc::MockOutputStream mock; diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index fd184958d51..efa22636572 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -199,8 +199,8 @@ class JsonSchemaWriter : public TypeVisitor { // Write type ids writer_->Key("typeIds"); writer_->StartArray(); - for (size_t i = 0; i < type.type_ids.size(); ++i) { - writer_->Uint(type.type_ids[i]); + for (size_t i = 0; i < type.type_codes.size(); ++i) { + writer_->Uint(type.type_codes[i]); } writer_->EndArray(); } @@ -718,17 +718,17 @@ class JsonSchemaReader { return Status::Invalid(ss.str()); } - const auto& json_type_ids = json_type.FindMember("typeIds"); - RETURN_NOT_ARRAY("typeIds", json_type_ids, json_type); + const auto& json_type_codes = json_type.FindMember("typeIds"); + RETURN_NOT_ARRAY("typeIds", json_type_codes, json_type); - std::vector type_ids; - const auto& id_array = json_type_ids->value.GetArray(); + std::vector type_codes; + const auto& id_array = json_type_codes->value.GetArray(); for (const rj::Value& val : id_array) { DCHECK(val.IsUint()); - type_ids.push_back(val.GetUint()); + type_codes.push_back(val.GetUint()); } - *type = union_(children, type_ids, mode); + *type = union_(children, type_codes, mode); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 9b155d3c647..b33201525ae 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -194,7 +194,6 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { // Example data MemoryPool* pool = default_memory_pool(); - const int length = 200; const bool include_nulls = true; std::shared_ptr leaf_values, list_array, list_list_array, flat_array; RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); @@ -202,7 +201,7 @@ Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { RETURN_NOT_OK( MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); - out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + out->reset(new RecordBatch(schema, 0, {list_array, list_list_array, flat_array})); return Status::OK(); } diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 288667834e4..f6497d2b076 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -164,8 +164,15 @@ class ArrayPrinter : public ArrayVisitor { Status WriteValidityBitmap(const Array& array) { Newline(); Write("-- is_valid: "); - BooleanArray is_valid(array.length(), array.null_bitmap()); - return PrettyPrint(is_valid, indent_ + 2, sink_); + + if (array.null_count() > 0) { + BooleanArray is_valid( + array.length(), array.null_bitmap(), nullptr, 0, array.offset()); + return PrettyPrint(is_valid, indent_ + 2, sink_); + } else { + Write("all not null"); + return Status::OK(); + } } Status Visit(const ListArray& array) override { @@ -173,7 +180,8 @@ class ArrayPrinter : public ArrayVisitor { Newline(); Write("-- value_offsets: "); - Int32Array value_offsets(array.length() + 1, array.value_offsets()); + Int32Array value_offsets( + array.length() + 1, array.value_offsets(), nullptr, 0, array.offset()); RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); Newline(); @@ -187,20 +195,25 @@ class ArrayPrinter : public ArrayVisitor { return Status::OK(); } - Status PrintChildren(const std::vector>& fields) { + Status PrintChildren( + const std::vector>& fields, int32_t offset, int32_t length) { for (size_t i = 0; i < fields.size(); ++i) { Newline(); std::stringstream ss; ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << " values: "; Write(ss.str()); - RETURN_NOT_OK(PrettyPrint(*fields[i], indent_ + 2, sink_)); + + std::shared_ptr field = fields[i]; + if (offset != 0) { field = field->Slice(offset, length); } + + RETURN_NOT_OK(PrettyPrint(*field, indent_ + 2, sink_)); } return Status::OK(); } Status Visit(const StructArray& array) override { RETURN_NOT_OK(WriteValidityBitmap(array)); - return PrintChildren(array.fields()); + return PrintChildren(array.fields(), array.offset(), array.length()); } Status Visit(const UnionArray& array) override { @@ -208,17 +221,18 @@ class ArrayPrinter : public ArrayVisitor { Newline(); Write("-- type_ids: "); - UInt8Array type_ids(array.length(), array.type_ids()); + UInt8Array type_ids(array.length(), array.type_ids(), nullptr, 0, array.offset()); RETURN_NOT_OK(PrettyPrint(type_ids, indent_ + 2, sink_)); if (array.mode() == UnionMode::DENSE) { Newline(); Write("-- value_offsets: "); - Int32Array value_offsets(array.length(), array.value_offsets()); + Int32Array value_offsets( + array.length(), array.value_offsets(), nullptr, 0, array.offset()); RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); } - return PrintChildren(array.children()); + return PrintChildren(array.children(), array.offset(), array.length()); } Status Visit(const DictionaryArray& array) override { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ba775845fca..a1c2b79950d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -115,7 +115,7 @@ std::string UnionType::ToString() const { for (size_t i = 0; i < children_.size(); ++i) { if (i) { s << ", "; } - s << children_[i]->ToString() << "=" << static_cast(type_ids[i]); + s << children_[i]->ToString() << "=" << static_cast(type_codes[i]); } s << ">"; return s.str(); @@ -224,8 +224,8 @@ std::shared_ptr struct_(const std::vector>& fie } std::shared_ptr union_(const std::vector>& child_fields, - const std::vector& type_ids, UnionMode mode) { - return std::make_shared(child_fields, type_ids, mode); + const std::vector& type_codes, UnionMode mode) { + return std::make_shared(child_fields, type_codes, mode); } std::shared_ptr dictionary(const std::shared_ptr& index_type, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8638a3f4b6e..927b8a44fe1 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -413,8 +413,8 @@ struct ARROW_EXPORT UnionType : public DataType { static constexpr Type::type type_id = Type::UNION; UnionType(const std::vector>& fields, - const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE) - : DataType(Type::UNION), mode(mode), type_ids(type_ids) { + const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE) + : DataType(Type::UNION), mode(mode), type_codes(type_codes) { children_ = fields; } @@ -429,7 +429,7 @@ struct ARROW_EXPORT UnionType : public DataType { // The type id used in the data to indicate each data type in the union. For // example, the first type in the union might be denoted by the id 5 (instead // of 0). - std::vector type_ids; + std::vector type_codes; }; // ---------------------------------------------------------------------- @@ -551,7 +551,7 @@ std::shared_ptr ARROW_EXPORT struct_( std::shared_ptr ARROW_EXPORT union_( const std::vector>& child_fields, - const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE); + const std::vector& type_codes, UnionMode mode = UnionMode::SPARSE); std::shared_ptr ARROW_EXPORT dictionary( const std::shared_ptr& index_type, const std::shared_ptr& values); diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index bf7731f22c5..f3fbb41fa54 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -110,10 +110,14 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* data, int32_t offset, int32_t return Status::OK(); } -bool BitmapEquals(const uint8_t* left, const uint8_t* right, int64_t bit_length) { +bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, + int64_t right_offset, int64_t bit_length) { // TODO(wesm): Make this faster using word-wise comparisons for (int64_t i = 0; i < bit_length; ++i) { - if (BitUtil::GetBit(left, i) != BitUtil::GetBit(right, i)) { return false; } + if (BitUtil::GetBit(left, left_offset + i) != + BitUtil::GetBit(right, right_offset + i)) { + return false; + } } return true; } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index bfbe657325c..a0fbdd2f92c 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -140,8 +140,8 @@ Status ARROW_EXPORT CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int32_t int64_t ARROW_EXPORT CountSetBits( const uint8_t* data, int64_t bit_offset, int64_t length); -bool ARROW_EXPORT BitmapEquals( - const uint8_t* left, const uint8_t* right, int64_t bit_length); +bool ARROW_EXPORT BitmapEquals(const uint8_t* left, int64_t left_offset, + const uint8_t* right, int64_t right_offset, int64_t bit_length); } // namespace arrow From 4f086284597a8da11383c28707e5ebbc3f984c56 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 01:02:58 -0500 Subject: [PATCH 11/15] Add missing include Change-Id: I23040e716d40f6d19642165f21a25f741ce8879f --- cpp/src/arrow/ipc/test-common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index b33201525ae..385196e11f7 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include From 2a1392994b1ff98879604f50e8fa7bd7900d3358 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 09:27:52 -0500 Subject: [PATCH 12/15] Implement slicing IPC logic for dense array Change-Id: Iead66e3b86a52fa9b1188e4ceb477e5cdfecd744 --- cpp/src/arrow/ipc/adapter.cc | 72 +++++++++++++++++++++++------------ cpp/src/arrow/pretty_print.cc | 3 +- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 67be508b2fb..312bf9379a8 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -396,47 +396,71 @@ class RecordBatchWriter : public ArrayVisitor { array.length() * sizeof(UnionArray::type_id_t)); } - const auto& type = static_cast(*array.type()); - buffers_.push_back(type_ids); + --max_recursion_depth_; if (array.mode() == UnionMode::DENSE) { + const auto& type = static_cast(*array.type()); auto value_offsets = array.value_offsets(); + + // The Union type codes are not necessary 0-indexed + uint8_t max_code = 0; + for (uint8_t code : type.type_codes) { + if (code > max_code) { max_code = code; } + } + + // Allocate an array of child offsets. Set all to -1 to indicate that we + // haven't observed a first occurrence of a particular child yet + std::vector child_offsets(max_code + 1); + std::vector child_lengths(max_code + 1, 0); + if (array.offset() != 0) { // This is an unpleasant case. Because the offsets are different for // each child array, when we have a sliced array, we need to "rebase" // the value_offsets for each array - // std::shared_ptr shifted_buffer; - // RETURN_NOT_OK(AllocateBuffer(pool_, array.length() * sizeof(int32_t), - // &shifted_buffer)); - // int32_t* shifted_offsets = - // reinterpret_cast(shifted_buffer->mutable_data()); + const int32_t* unshifted_offsets = array.raw_value_offsets(); + const uint8_t* type_ids = array.raw_type_ids(); - // Allocate an array of base offsets + // Allocate the shifted offsets + std::shared_ptr shifted_offsets_buffer; + RETURN_NOT_OK(AllocateBuffer( + pool_, array.length() * sizeof(int32_t), &shifted_offsets_buffer)); + int32_t* shifted_offsets = + reinterpret_cast(shifted_offsets_buffer->mutable_data()); - // std::vector base_offsets(type.type_codes.size(), 0); - // const uint8_t* type_ids = array.raw_type_ids(); + for (int32_t i = 0; i < array.length(); ++i) { + const uint8_t code = type_ids[i]; + int32_t shift = child_offsets[code]; + if (shift == -1) { child_offsets[code] = shift = unshifted_offsets[i]; } + shifted_offsets[i] = unshifted_offsets[i] - shift; - // TODO(wesm): XXX + // Update the child length to account for observed value + ++child_lengths[code]; + } - // Examine the data up until array.offset() to determine how much to - // shift the data - // for (int32_t i = 0 - - value_offsets = SliceBuffer(value_offsets, array.offset() * sizeof(int32_t), - array.length() * sizeof(int32_t)); + value_offsets = shifted_offsets_buffer; } buffers_.push_back(value_offsets); - } - --max_recursion_depth_; - for (std::shared_ptr field : array.children()) { - if (array.offset() != 0) { - // If offset is non-zero, slice the child array - field = field->Slice(array.offset(), array.length()); + // Visit children and slice accordingly + for (int i = 0; i < type.num_children(); ++i) { + std::shared_ptr child = array.child(i); + if (array.offset() != 0) { + const uint8_t code = type.type_codes[i]; + child = child->Slice(child_offsets[code], child_lengths[code]); + } + RETURN_NOT_OK(VisitArray(*child)); + } + } else { + for (std::shared_ptr child : array.children()) { + // Sparse union, slicing is simpler + if (array.offset() != 0) { + // If offset is non-zero, slice the child array + child = child->Slice(array.offset(), array.length()); + } + RETURN_NOT_OK(VisitArray(*child)); } - RETURN_NOT_OK(VisitArray(*field)); } ++max_recursion_depth_; return Status::OK(); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index f6497d2b076..23c05807c16 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -232,7 +232,8 @@ class ArrayPrinter : public ArrayVisitor { RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + 2, sink_)); } - return PrintChildren(array.children(), array.offset(), array.length()); + // Print the children without any offset, because the type ids are absolute + return PrintChildren(array.children(), 0, array.length() + array.offset()); } Status Visit(const DictionaryArray& array) override { From 9a008704305ba7a1e20d41d0f3fe3714ba13bbaa Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 09:55:34 -0500 Subject: [PATCH 13/15] Make ApproxEquals for floating point arrays work on slices Change-Id: Ife519a479657883e2c93a96cad587725bfa81179 --- cpp/src/arrow/array-dictionary-test.cc | 20 +++++++------- cpp/src/arrow/array-primitive-test.cc | 36 ++++++++++++++++++++++++++ cpp/src/arrow/array-string-test.cc | 13 +++------- cpp/src/arrow/builder.cc | 11 ++++---- cpp/src/arrow/builder.h | 4 +-- cpp/src/arrow/column-test.cc | 14 +++++----- cpp/src/arrow/compare.cc | 4 +-- cpp/src/arrow/ipc/adapter.cc | 1 + cpp/src/arrow/ipc/ipc-json-test.cc | 17 ++++++------ cpp/src/arrow/ipc/json-internal.cc | 2 +- cpp/src/arrow/ipc/test-common.h | 29 ++++++++++----------- cpp/src/arrow/pretty_print-test.cc | 6 ++--- cpp/src/arrow/test-util.h | 10 +++---- 13 files changed, 94 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc index ef1e5e07f02..61381b76711 100644 --- a/cpp/src/arrow/array-dictionary-test.cc +++ b/cpp/src/arrow/array-dictionary-test.cc @@ -34,7 +34,7 @@ namespace arrow { TEST(TestDictionary, Basics) { std::vector values = {100, 1000, 10000, 100000}; std::shared_ptr dict; - ArrayFromVector(int32(), values, &dict); + ArrayFromVector(values, &dict); std::shared_ptr type1 = std::dynamic_pointer_cast(dictionary(int16(), dict)); @@ -54,25 +54,25 @@ TEST(TestDictionary, Equals) { std::shared_ptr dict; std::vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(utf8(), dict_values, &dict); + ArrayFromVector(dict_values, &dict); std::shared_ptr dict_type = dictionary(int16(), dict); std::shared_ptr dict2; std::vector dict2_values = {"foo", "bar", "baz", "qux"}; - ArrayFromVector(utf8(), dict2_values, &dict2); + ArrayFromVector(dict2_values, &dict2); std::shared_ptr dict2_type = dictionary(int16(), dict2); std::shared_ptr indices; std::vector indices_values = {1, 2, -1, 0, 2, 0}; - ArrayFromVector(int16(), is_valid, indices_values, &indices); + ArrayFromVector(is_valid, indices_values, &indices); std::shared_ptr indices2; std::vector indices2_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(int16(), is_valid, indices2_values, &indices2); + ArrayFromVector(is_valid, indices2_values, &indices2); std::shared_ptr indices3; std::vector indices3_values = {1, 1, 0, 0, 2, 0}; - ArrayFromVector(int16(), is_valid, indices3_values, &indices3); + ArrayFromVector(is_valid, indices3_values, &indices3); auto array = std::make_shared(dict_type, indices); auto array2 = std::make_shared(dict_type, indices2); @@ -122,20 +122,20 @@ TEST(TestDictionary, Validate) { std::shared_ptr dict; std::vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(utf8(), dict_values, &dict); + ArrayFromVector(dict_values, &dict); std::shared_ptr dict_type = dictionary(int16(), dict); std::shared_ptr indices; std::vector indices_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(uint8(), is_valid, indices_values, &indices); + ArrayFromVector(is_valid, indices_values, &indices); std::shared_ptr indices2; std::vector indices2_values = {1., 2., 0., 0., 2., 0.}; - ArrayFromVector(float32(), is_valid, indices2_values, &indices2); + ArrayFromVector(is_valid, indices2_values, &indices2); std::shared_ptr indices3; std::vector indices3_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector(int64(), is_valid, indices3_values, &indices3); + ArrayFromVector(is_valid, indices3_values, &indices3); std::shared_ptr arr = std::make_shared(dict_type, indices); std::shared_ptr arr2 = std::make_shared(dict_type, indices2); diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index d781e9cde00..a20fdbf8b91 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -505,4 +505,40 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); } +template +void CheckSliceApproxEquals() { + using T = typename TYPE::c_type; + + const int kSize = 50; + std::vector draws1; + std::vector draws2; + + const uint32_t kSeed = 0; + test::random_real(kSize, kSeed, 0, 100, &draws1); + test::random_real(kSize, kSeed + 1, 0, 100, &draws2); + + // Make the draws equal in the sliced segment, but unequal elsewhere (to + // catch not using the slice offset) + for (int i = 10; i < 30; ++i) { + draws2[i] = draws1[i]; + } + + std::vector is_valid; + test::random_is_valid(kSize, 0.1, &is_valid); + + std::shared_ptr array1, array2; + ArrayFromVector(is_valid, draws1, &array1); + ArrayFromVector(is_valid, draws2, &array2); + + std::shared_ptr slice1 = array1->Slice(10, 20); + std::shared_ptr slice2 = array2->Slice(10, 20); + + ASSERT_TRUE(slice1->ApproxEquals(slice2)); +} + +TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) { + CheckSliceApproxEquals(); + CheckSliceApproxEquals(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index e2712738628..8b7eb41d4c3 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -147,8 +147,7 @@ class TestStringBuilder : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); - type_ = TypePtr(new StringType()); - builder_.reset(new StringBuilder(pool_, type_)); + builder_.reset(new StringBuilder(pool_)); } void Done() { @@ -160,8 +159,6 @@ class TestStringBuilder : public TestBuilder { } protected: - TypePtr type_; - std::unique_ptr builder_; std::shared_ptr result_; }; @@ -318,8 +315,7 @@ class TestBinaryBuilder : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); - type_ = TypePtr(new BinaryType()); - builder_.reset(new BinaryBuilder(pool_, type_)); + builder_.reset(new BinaryBuilder(pool_)); } void Done() { @@ -331,8 +327,6 @@ class TestBinaryBuilder : public TestBuilder { } protected: - TypePtr type_; - std::unique_ptr builder_; std::shared_ptr result_; }; @@ -385,8 +379,7 @@ void CheckSliceEquality() { using Traits = TypeTraits; using BuilderType = typename Traits::BuilderType; - auto type = Traits::type_singleton(); - BuilderType builder(default_memory_pool(), type); + BuilderType builder(default_memory_pool()); std::vector strings = {"foo", "", "bar", "baz", "qux", ""}; std::vector is_null = {0, 1, 0, 1, 0, 0}; diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 5c6b19ff228..36eca12014c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -333,14 +333,13 @@ std::shared_ptr ListBuilder::value_builder() const { // ---------------------------------------------------------------------- // String and binary -// This used to be a static member variable of BinaryBuilder, but it can cause -// valgrind to report a (spurious?) memory leak when needed in other shared -// libraries. The problem came up while adding explicit visibility to libarrow -// and libparquet_arrow -static TypePtr kBinaryValueType = TypePtr(new UInt8Type()); +BinaryBuilder::BinaryBuilder(MemoryPool* pool) + : ListBuilder(pool, std::make_shared(pool, uint8()), binary()) { + byte_builder_ = static_cast(value_builder_.get()); +} BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) - : ListBuilder(pool, std::make_shared(pool, kBinaryValueType), type) { + : ListBuilder(pool, std::make_shared(pool, uint8()), type) { byte_builder_ = static_cast(value_builder_.get()); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 7a32aae9ba4..eb1ac5a866f 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -362,6 +362,7 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { // BinaryBuilder : public ListBuilder class ARROW_EXPORT BinaryBuilder : public ListBuilder { public: + explicit BinaryBuilder(MemoryPool* pool); explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type); Status Append(const uint8_t* value, int32_t length) { @@ -387,9 +388,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool = default_memory_pool()) : BinaryBuilder(pool, utf8()) {} - explicit StringBuilder(MemoryPool* pool, const std::shared_ptr& type) - : BinaryBuilder(pool, type) {} - using BinaryBuilder::Append; Status Finish(std::shared_ptr* out) override; diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 1e722ed7de0..0bbfc831f5c 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -51,7 +51,7 @@ TEST_F(TestChunkedArray, BasicEquals) { std::vector null_bitmap(100, true); std::vector data(100, 1); std::shared_ptr array; - ArrayFromVector(int32(), null_bitmap, data, &array); + ArrayFromVector(null_bitmap, data, &array); arrays_one_.push_back(array); arrays_another_.push_back(array); @@ -67,9 +67,9 @@ TEST_F(TestChunkedArray, EqualsDifferingTypes) { std::vector data32(100, 1); std::vector data64(100, 1); std::shared_ptr array; - ArrayFromVector(int32(), null_bitmap, data32, &array); + ArrayFromVector(null_bitmap, data32, &array); arrays_one_.push_back(array); - ArrayFromVector(int64(), null_bitmap, data64, &array); + ArrayFromVector(null_bitmap, data64, &array); arrays_another_.push_back(array); Construct(); @@ -83,9 +83,9 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) { std::vector data100(100, 1); std::vector data101(101, 1); std::shared_ptr array; - ArrayFromVector(int32(), null_bitmap100, data100, &array); + ArrayFromVector(null_bitmap100, data100, &array); arrays_one_.push_back(array); - ArrayFromVector(int32(), null_bitmap101, data101, &array); + ArrayFromVector(null_bitmap101, data101, &array); arrays_another_.push_back(array); Construct(); @@ -94,7 +94,7 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) { std::vector null_bitmap1(1, true); std::vector data1(1, 1); - ArrayFromVector(int32(), null_bitmap1, data1, &array); + ArrayFromVector(null_bitmap1, data1, &array); arrays_one_.push_back(array); Construct(); @@ -156,7 +156,7 @@ TEST_F(TestColumn, Equals) { std::vector null_bitmap(100, true); std::vector data(100, 1); std::shared_ptr array; - ArrayFromVector(int32(), null_bitmap, data, &array); + ArrayFromVector(null_bitmap, data, &array); arrays_one_.push_back(array); arrays_another_.push_back(array); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 9979554f88b..27fad713572 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -480,8 +480,8 @@ inline bool FloatingApproxEquals( const NumericArray& left, const NumericArray& right) { using T = typename TYPE::c_type; - auto left_data = reinterpret_cast(left.data()->data()); - auto right_data = reinterpret_cast(right.data()->data()); + const T* left_data = left.raw_data(); + const T* right_data = right.raw_data(); static constexpr T EPSILON = 1E-5; diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 312bf9379a8..3613ccbadbb 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -17,6 +17,7 @@ #include "arrow/ipc/adapter.h" +#include #include #include #include diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index ef1a0f36173..3e759cccbbc 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -80,7 +80,7 @@ template void CheckPrimitive(const std::shared_ptr& type, const std::vector& is_valid, const std::vector& values) { MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, type); + typename TypeTraits::BuilderType builder(pool); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { @@ -146,12 +146,11 @@ TEST(TestJsonArrayWriter, NestedTypes) { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr values_array; - ArrayFromVector(int32(), values_is_valid, values, &values_array); + ArrayFromVector(values_is_valid, values, &values_array); std::vector i16_values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr i16_values_array; - ArrayFromVector( - int16(), values_is_valid, i16_values, &i16_values_array); + ArrayFromVector(values_is_valid, i16_values, &i16_values_array); // List std::vector list_is_valid = {true, false, true, true, true}; @@ -202,15 +201,15 @@ void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, test::randint(num_rows, 0, 100, &v2_values); std::shared_ptr v1; - ArrayFromVector(schema->field(0)->type, is_valid, v1_values, &v1); + ArrayFromVector(is_valid, v1_values, &v1); std::shared_ptr v2; - ArrayFromVector(schema->field(1)->type, is_valid, v2_values, &v2); + ArrayFromVector(is_valid, v2_values, &v2); static const int kBufferSize = 10; static uint8_t buffer[kBufferSize]; static uint32_t seed = 0; - StringBuilder string_builder(default_memory_pool(), utf8()); + StringBuilder string_builder(default_memory_pool()); for (int i = 0; i < num_rows; ++i) { if (!is_valid[i]) { string_builder.AppendNull(); @@ -338,13 +337,13 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) { std::vector foo_valid = {true, false, true, true, true}; std::vector foo_values = {1, 2, 3, 4, 5}; std::shared_ptr foo; - ArrayFromVector(int32(), foo_valid, foo_values, &foo); + ArrayFromVector(foo_valid, foo_values, &foo); ASSERT_TRUE(batch->column(0)->Equals(foo)); std::vector bar_valid = {true, false, false, true, true}; std::vector bar_values = {1, 2, 3, 4, 5}; std::shared_ptr bar; - ArrayFromVector(float64(), bar_valid, bar_values, &bar); + ArrayFromVector(bar_valid, bar_values, &bar); ASSERT_TRUE(batch->column(1)->Equals(bar)); } diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index efa22636572..1a95b2ce470 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -844,7 +844,7 @@ class JsonArrayReader { typename std::enable_if::value, Status>::type ReadArray( const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { - typename TypeTraits::BuilderType builder(pool_, type); + typename TypeTraits::BuilderType builder(pool_); const auto& json_data = json_array.FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, json_array); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 385196e11f7..b4930c4555d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -29,6 +29,7 @@ #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/memory_pool.h" +#include "arrow/status.h" #include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" @@ -120,10 +121,10 @@ Status MakeIntRecordBatch(std::shared_ptr* out) { template Status MakeRandomBinaryArray( - const TypePtr& type, int32_t length, MemoryPool* pool, std::shared_ptr* out) { + int32_t length, MemoryPool* pool, std::shared_ptr* out) { const std::vector values = { "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; - Builder builder(pool, type); + Builder builder(pool); const auto values_len = values.size(); for (int32_t i = 0; i < length; ++i) { int values_index = i % values_len; @@ -149,15 +150,15 @@ Status MakeStringTypesRecordBatch(std::shared_ptr* out) { std::shared_ptr a0, a1; MemoryPool* pool = default_memory_pool(); + // Quirk with RETURN_NOT_OK macro and templated functions { - auto status = - MakeRandomBinaryArray(string_type, length, pool, &a0); - RETURN_NOT_OK(status); + auto s = MakeRandomBinaryArray(length, pool, &a0); + RETURN_NOT_OK(s); } + { - auto status = - MakeRandomBinaryArray(binary_type, length, pool, &a1); - RETURN_NOT_OK(status); + auto s = MakeRandomBinaryArray(length, pool, &a1); + RETURN_NOT_OK(s); } out->reset(new RecordBatch(schema, length, {a0, a1})); return Status::OK(); @@ -308,21 +309,17 @@ Status MakeUnion(std::shared_ptr* out) { RETURN_NOT_OK(test::CopyBufferFromVector(type_ids, &type_ids_buffer)); std::vector u0_values = {0, 1, 2, 3, 4, 5, 6}; - ArrayFromVector( - sparse_type->child(0)->type, u0_values, &sparse_children[0]); + ArrayFromVector(u0_values, &sparse_children[0]); std::vector u1_values = {10, 11, 12, 13, 14, 15, 16}; - ArrayFromVector( - sparse_type->child(1)->type, u1_values, &sparse_children[1]); + ArrayFromVector(u1_values, &sparse_children[1]); // dense children u0_values = {0, 2, 3, 7}; - ArrayFromVector( - dense_type->child(0)->type, u0_values, &dense_children[0]); + ArrayFromVector(u0_values, &dense_children[0]); u1_values = {11, 14, 15}; - ArrayFromVector( - dense_type->child(1)->type, u1_values, &dense_children[1]); + ArrayFromVector(u1_values, &dense_children[1]); std::shared_ptr offsets_buffer; std::vector offsets = {0, 0, 1, 2, 1, 2, 3}; diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 4725d5dd808..aca650f0a92 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -55,7 +55,7 @@ template void CheckPrimitive(int indent, const std::vector& is_valid, const std::vector& values, const char* expected) { std::shared_ptr array; - ArrayFromVector(std::make_shared(), is_valid, values, &array); + ArrayFromVector(is_valid, values, &array); CheckArray(*array.get(), indent, expected); } @@ -76,12 +76,12 @@ TEST_F(TestPrettyPrint, DictionaryType) { std::shared_ptr dict; std::vector dict_values = {"foo", "bar", "baz"}; - ArrayFromVector(utf8(), dict_values, &dict); + ArrayFromVector(dict_values, &dict); std::shared_ptr dict_type = dictionary(int16(), dict); std::shared_ptr indices; std::vector indices_values = {1, 2, -1, 0, 2, 0}; - ArrayFromVector(int16(), is_valid, indices_values, &indices); + ArrayFromVector(is_valid, indices_values, &indices); auto arr = std::make_shared(dict_type, indices); static const char* expected = R"expected( diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2df8e957685..ffc78067d1b 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -232,11 +232,10 @@ class TestBase : public ::testing::Test { }; template -void ArrayFromVector(const std::shared_ptr& type, - const std::vector& is_valid, const std::vector& values, +void ArrayFromVector(const std::vector& is_valid, const std::vector& values, std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, std::make_shared()); + typename TypeTraits::BuilderType builder(pool); for (size_t i = 0; i < values.size(); ++i) { if (is_valid[i]) { ASSERT_OK(builder.Append(values[i])); @@ -248,10 +247,9 @@ void ArrayFromVector(const std::shared_ptr& type, } template -void ArrayFromVector(const std::shared_ptr& type, - const std::vector& values, std::shared_ptr* out) { +void ArrayFromVector(const std::vector& values, std::shared_ptr* out) { MemoryPool* pool = default_memory_pool(); - typename TypeTraits::BuilderType builder(pool, std::make_shared()); + typename TypeTraits::BuilderType builder(pool); for (size_t i = 0; i < values.size(); ++i) { ASSERT_OK(builder.Append(values[i])); } From 86511a36b0918bb2c2a1c33673c7ee244d82d5e0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 10:34:10 -0500 Subject: [PATCH 14/15] Python fixes, clang warning fixes Change-Id: Ifa914f92611e00a9e3514b65680c8d2254b63a29 --- cpp/src/arrow/builder.h | 4 ++-- python/CMakeLists.txt | 2 +- python/pyarrow/includes/libarrow.pxd | 4 ++-- python/pyarrow/scalar.pyx | 2 +- python/src/pyarrow/adapters/builtin.cc | 2 +- python/src/pyarrow/adapters/pandas.cc | 20 +++++++++++--------- python/src/pyarrow/io.cc | 21 ++++++++++++++++----- 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index eb1ac5a866f..37a2f342717 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -141,7 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { using value_type = typename Type::c_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) - : ArrayBuilder(pool, type), data_(nullptr) {} + : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} using ArrayBuilder::Advance; @@ -240,7 +240,7 @@ using DoubleBuilder = NumericBuilder; class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean()) - : ArrayBuilder(pool, type), data_(nullptr) {} + : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} using ArrayBuilder::Advance; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 842a2196dab..ba26692b32b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -95,7 +95,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") # Cython warnings in clang - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-constant-logical-operand") endif() set(PYARROW_LINK "a") diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 38883e811e1..ebfdc410fa0 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -179,8 +179,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: double Value(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): - const int32_t* offsets() - int32_t offset(int i) + const int32_t* raw_value_offsets() + int32_t value_offset(int i) int32_t value_length(int i) shared_ptr[CArray] values() shared_ptr[CDataType] value_type() diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 30b90408dc0..9d2b2b11a80 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -202,7 +202,7 @@ cdef class ListValue(ArrayValue): self.value_type = box_data_type(self.ap.value_type()) cdef getitem(self, int i): - cdef int j = self.ap.offset(self.index) + i + cdef int j = self.ap.value_offset(self.index) + i return box_arrow_scalar(self.value_type, self.ap.values(), j) def as_py(self): diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 1abfb409118..5fd8eef23fe 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -505,7 +505,7 @@ Status ConvertPySequence( // Handle NA / NullType case if (type->type == Type::NA) { - out->reset(new arrow::NullArray(type, size)); + out->reset(new arrow::NullArray(size)); return Status::OK(); } diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 8d05821c2fd..345dc9070e5 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -1338,8 +1338,7 @@ class ArrowSerializer { PyAcquireGIL lock; PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - arrow::TypePtr string_type(new arrow::DateType()); - arrow::DateBuilder date_builder(pool_, string_type); + arrow::DateBuilder date_builder(pool_); RETURN_NOT_OK(date_builder.Resize(length_)); Status s; @@ -1363,8 +1362,7 @@ class ArrowSerializer { // and unicode mixed in the object array PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - arrow::TypePtr string_type(new arrow::StringType()); - arrow::StringBuilder string_builder(pool_, string_type); + arrow::StringBuilder string_builder(pool_); RETURN_NOT_OK(string_builder.Resize(length_)); Status s; @@ -1374,8 +1372,8 @@ class ArrowSerializer { if (have_bytes) { const auto& arr = static_cast(*out->get()); - *out = std::make_shared( - arr.length(), arr.offsets(), arr.data(), arr.null_count(), arr.null_bitmap()); + *out = std::make_shared(arr.length(), arr.value_offsets(), + arr.data(), arr.null_bitmap(), arr.null_count()); } return Status::OK(); } @@ -1403,7 +1401,7 @@ class ArrowSerializer { } } - *out = std::make_shared(length_, data, null_count, null_bitmap_); + *out = std::make_shared(length_, data, null_bitmap_, null_count); return Status::OK(); } @@ -1515,10 +1513,14 @@ inline Status ArrowSerializer::Convert(std::shared_ptr* out) { null_count = ValuesToBitmap(PyArray_DATA(arr_), length_, null_bitmap_data_); } + // For readability + constexpr int32_t kOffset = 0; + RETURN_NOT_OK(ConvertData()); std::shared_ptr type; RETURN_NOT_OK(MakeDataType(&type)); - RETURN_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out)); + RETURN_NOT_OK( + MakePrimitiveArray(type, length_, data_, null_bitmap_, null_count, kOffset, out)); return Status::OK(); } @@ -1657,7 +1659,7 @@ ArrowSerializer::ConvertTypedLists( // TODO: If there are bytes involed, convert to Binary representation bool have_bytes = false; - auto value_builder = std::make_shared(pool_, field->type); + auto value_builder = std::make_shared(pool_); ListBuilder list_builder(pool_, value_builder); PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); for (int64_t i = 0; i < length_; ++i) { diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 92352607e62..aa4cb7b052c 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -56,9 +56,20 @@ static Status CheckPyError() { return Status::OK(); } +// This is annoying: because C++11 does not allow implicit conversion of string +// literals to non-const char*, we need to go through some gymnastics to use +// PyObject_CallMethod without a lot of pain (its arguments are non-const +// char*) +template +static inline PyObject* cpp_PyObject_CallMethod( + PyObject* obj, const char* method_name, const char* argspec, ArgTypes... args) { + return PyObject_CallMethod( + obj, const_cast(method_name), const_cast(argspec), args...); +} + Status PythonFile::Close() { // whence: 0 for relative to start of file, 2 for end of file - PyObject* result = PyObject_CallMethod(file_, "close", "()"); + PyObject* result = cpp_PyObject_CallMethod(file_, "close", "()"); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); return Status::OK(); @@ -66,14 +77,14 @@ Status PythonFile::Close() { Status PythonFile::Seek(int64_t position, int whence) { // whence: 0 for relative to start of file, 2 for end of file - PyObject* result = PyObject_CallMethod(file_, "seek", "(ii)", position, whence); + PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ii)", position, whence); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); return Status::OK(); } Status PythonFile::Read(int64_t nbytes, PyObject** out) { - PyObject* result = PyObject_CallMethod(file_, "read", "(i)", nbytes); + PyObject* result = cpp_PyObject_CallMethod(file_, "read", "(i)", nbytes); ARROW_RETURN_NOT_OK(CheckPyError()); *out = result; return Status::OK(); @@ -84,7 +95,7 @@ Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { PyBytes_FromStringAndSize(reinterpret_cast(data), nbytes); ARROW_RETURN_NOT_OK(CheckPyError()); - PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data); + PyObject* result = cpp_PyObject_CallMethod(file_, "write", "(O)", py_data); Py_XDECREF(py_data); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); @@ -92,7 +103,7 @@ Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { } Status PythonFile::Tell(int64_t* position) { - PyObject* result = PyObject_CallMethod(file_, "tell", "()"); + PyObject* result = cpp_PyObject_CallMethod(file_, "tell", "()"); ARROW_RETURN_NOT_OK(CheckPyError()); *position = PyLong_AsLongLong(result); From 61afe42c6792d3d6a9ed93139c220ddef2580b82 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 11:01:57 -0500 Subject: [PATCH 15/15] Some API cleaning in builder.h Change-Id: Id76bae5adf3263f0b01ef015c4a6857779262df0 --- cpp/src/arrow/builder.cc | 37 +++++++++++++++++++++++-------------- cpp/src/arrow/builder.h | 8 ++++---- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 36eca12014c..dddadeee0da 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -202,10 +202,19 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; +template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; +BooleanBuilder::BooleanBuilder(MemoryPool* pool) + : ArrayBuilder(pool, boolean()), data_(nullptr), raw_data_(nullptr) {} + +BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type) + : BooleanBuilder(pool) { + DCHECK_EQ(Type::BOOL, type->type); +} + Status BooleanBuilder::Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared(pool_); @@ -355,6 +364,8 @@ Status BinaryBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } +StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {} + Status StringBuilder::Finish(std::shared_ptr* out) { std::shared_ptr result; RETURN_NOT_OK(ListBuilder::Finish(&result)); @@ -392,9 +403,9 @@ std::shared_ptr StructBuilder::field_builder(int pos) const { // ---------------------------------------------------------------------- // Helper functions -#define BUILDER_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + out->reset(new BuilderType(pool)); \ return Status::OK(); // Initially looked at doing this with vtables, but shared pointers makes it @@ -413,19 +424,17 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(UINT64, UInt64Builder); BUILDER_CASE(INT64, Int64Builder); BUILDER_CASE(DATE, DateBuilder); - BUILDER_CASE(TIMESTAMP, TimestampBuilder); - - BUILDER_CASE(BOOL, BooleanBuilder); - - BUILDER_CASE(FLOAT, FloatBuilder); - BUILDER_CASE(DOUBLE, DoubleBuilder); - - case Type::STRING: - out->reset(new StringBuilder(pool)); + case Type::TIMESTAMP: + out->reset(new TimestampBuilder(pool, type)); return Status::OK(); - case Type::BINARY: - out->reset(new BinaryBuilder(pool, type)); + case Type::TIME: + out->reset(new TimeBuilder(pool, type)); return Status::OK(); + BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(FLOAT, FloatBuilder); + BUILDER_CASE(DOUBLE, DoubleBuilder); + BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); case Type::LIST: { std::shared_ptr value_builder; std::shared_ptr value_type = diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 37a2f342717..0b83b9f3c68 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -231,6 +231,7 @@ using Int16Builder = NumericBuilder; using Int32Builder = NumericBuilder; using Int64Builder = NumericBuilder; using TimestampBuilder = NumericBuilder; +using TimeBuilder = NumericBuilder; using DateBuilder = NumericBuilder; using HalfFloatBuilder = NumericBuilder; @@ -239,8 +240,8 @@ using DoubleBuilder = NumericBuilder; class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: - explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean()) - : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} + explicit BooleanBuilder(MemoryPool* pool); + explicit BooleanBuilder(MemoryPool* pool, const std::shared_ptr& type); using ArrayBuilder::Advance; @@ -385,8 +386,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { // String builder class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: - explicit StringBuilder(MemoryPool* pool = default_memory_pool()) - : BinaryBuilder(pool, utf8()) {} + explicit StringBuilder(MemoryPool* pool); using BinaryBuilder::Append;