Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
80 commits
Select commit Hold shift + click to select a range
6de6f0f
refactor python to arrow conversions
kszucs Aug 31, 2020
19c07f1
fallback to bytes in case of mixed unicode and bytes data
kszucs Sep 1, 2020
ccf7b9a
fix error type
kszucs Sep 1, 2020
7e86eba
decrypt
kszucs Sep 2, 2020
eb8a785
remove converter.cc
kszucs Sep 2, 2020
f2b519e
remove unused headers
kszucs Sep 2, 2020
75bf699
dictionary support
kszucs Sep 2, 2020
0d73f02
fix dict conversion
kszucs Sep 3, 2020
aa6364f
refactor PyBytesView
kszucs Sep 3, 2020
8a2cc26
improve specialization
kszucs Sep 3, 2020
07b96b1
other specialization
kszucs Sep 3, 2020
74c74cf
minor cleanup
kszucs Sep 3, 2020
a1fa6cf
clang format
kszucs Sep 3, 2020
d0f5667
update dictionary scalar test
kszucs Sep 3, 2020
69cebce
fix dictionary scalar serialization and please gcc4.8
kszucs Sep 4, 2020
be9d8e7
fix dictionary scalar test
kszucs Sep 4, 2020
4f5891f
linting; try to fix msvc error
kszucs Sep 4, 2020
ddde7a9
avoid name collisions
kszucs Sep 4, 2020
e7c046c
use string view
kszucs Sep 4, 2020
22d098c
test that dictionary index type is honored
kszucs Sep 4, 2020
524dfbd
always return with an array instead of a chunkedarray
kszucs Sep 4, 2020
677adb2
address review comments
kszucs Sep 7, 2020
b4797ce
move to internal namespace
kszucs Sep 7, 2020
1a5ad12
simplify
kszucs Sep 8, 2020
9c2c5e5
overloaded AppendValue
kszucs Sep 8, 2020
9ece006
method specialization
kszucs Sep 8, 2020
53d83bf
specialize primitive converters
kszucs Sep 9, 2020
fb94b32
support list of pairs when converting to struct array; struct scalar …
kszucs Sep 9, 2020
748836b
auto chunking
kszucs Sep 9, 2020
c086364
working chunk on capacity overflow
kszucs Sep 10, 2020
5a38642
fix struct key kind inference and failing tests
kszucs Sep 10, 2020
4bc562c
use different aliases in the converter trait
kszucs Sep 10, 2020
2581619
validate overflow for list types as well
kszucs Sep 10, 2020
666eae3
fix assertion in numpy large memory test
kszucs Sep 10, 2020
79134e4
lint
kszucs Sep 10, 2020
80eff04
fix py35 tests cases; fix linting error
kszucs Sep 10, 2020
e2baab0
linting again
kszucs Sep 10, 2020
3dc203e
test nested overflow for string
kszucs Sep 11, 2020
723f602
test builder overflow checking
kszucs Sep 14, 2020
984c82e
pass pool separately
kszucs Sep 14, 2020
4b7a408
remove unused inference function
kszucs Sep 14, 2020
bf06472
pa.array docstrings
kszucs Sep 14, 2020
196a0c1
additional testing for struct conversion
kszucs Sep 15, 2020
50efce9
inline comments
kszucs Sep 15, 2020
a3af670
flake8
kszucs Sep 15, 2020
768c75c
missing parameter documentation
kszucs Sep 15, 2020
01b08fc
resolve rebase problem
kszucs Sep 15, 2020
9691c87
revert breaking StructScalar change
kszucs Sep 15, 2020
62a5afd
fix large memory test case
kszucs Sep 15, 2020
0e749a0
fix hypothesis test case
kszucs Sep 15, 2020
488fc7f
Apply suggestions from code review
kszucs Sep 19, 2020
09faa6c
move out Extend and ExtendMasked
kszucs Sep 19, 2020
35415e2
use trait to map types to converters
kszucs Sep 19, 2020
f4922ec
rename template argument
kszucs Sep 19, 2020
76938ca
flake8
kszucs Sep 19, 2020
d485bc1
resolve rebose conflicts
kszucs Sep 19, 2020
2c63f7d
chunker should not inherit from converter
kszucs Sep 19, 2020
68c3373
make chunker construction more straightforward
kszucs Sep 19, 2020
ca7238e
more verbose dictionary scalar naming in the python bindings
kszucs Sep 19, 2020
dd5fa70
address review comments
kszucs Sep 21, 2020
c70552b
address review comments
kszucs Sep 21, 2020
21e932b
address review comments
kszucs Sep 21, 2020
5545281
fix specialization
kszucs Sep 21, 2020
c52c367
clang format
kszucs Sep 21, 2020
4b426d5
fix overflow checking
kszucs Sep 21, 2020
d823c64
address review comments
kszucs Sep 21, 2020
57e6c7f
implement ValidateOverflow for map type; test list overflow chunking
kszucs Sep 21, 2020
45a93cf
address review comments
kszucs Sep 22, 2020
3359f47
test PYARROW_IGNORE_TIMEZONE; test map array overflow
kszucs Sep 22, 2020
e8a7475
address remaining review comments
kszucs Sep 23, 2020
e04e245
remove leftover
kszucs Sep 23, 2020
fe3ceb2
test cases for two additional auto chunking issues
kszucs Sep 23, 2020
01a89fc
additional overflow tests
kszucs Sep 23, 2020
aaf92b8
missing newline in docstrings
kszucs Sep 23, 2020
3115303
conversion optimizations
kszucs Sep 25, 2020
6c709e2
please gcc 4.8
kszucs Sep 25, 2020
b551d12
some inline notes describing performance improvements
kszucs Sep 25, 2020
806e4c0
try to please compilers
kszucs Sep 25, 2020
b527a0e
fix msvc issues
kszucs Sep 25, 2020
2213c25
cast to offset type when appending to string builders
kszucs Sep 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,13 @@ struct ScalarFromArraySlotImpl {
}

if (array_.IsNull(index_)) {
return MakeNullScalar(array_.type());
auto null = MakeNullScalar(array_.type());
if (is_dictionary(array_.type()->id())) {
auto& dict_null = checked_cast<DictionaryScalar&>(*null);
const auto& dict_array = checked_cast<const DictionaryArray&>(array_);
dict_null.value.dictionary = dict_array.dictionary();
}
return null;
}

RETURN_NOT_OK(VisitArrayInline(array_, this));
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,26 @@ class TestStringBuilder : public TestBuilder {
ASSERT_EQ(reps * 40, result_->value_data()->size());
}

void TestOverflowCheck() {
auto max_size = builder_->memory_limit();

ASSERT_OK(builder_->ValidateOverflow(1));
ASSERT_OK(builder_->ValidateOverflow(max_size));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size + 1));

ASSERT_OK(builder_->Append("bb"));
ASSERT_OK(builder_->ValidateOverflow(max_size - 2));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1));

ASSERT_OK(builder_->AppendNull());
ASSERT_OK(builder_->ValidateOverflow(max_size - 2));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 1));

ASSERT_OK(builder_->Append("ccc"));
ASSERT_OK(builder_->ValidateOverflow(max_size - 5));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_size - 4));
}

void TestZeroLength() {
// All buffers are null
Done();
Expand Down Expand Up @@ -602,6 +622,8 @@ TYPED_TEST(TestStringBuilder, TestCapacityReserve) { this->TestCapacityReserve()

TYPED_TEST(TestStringBuilder, TestZeroLength) { this->TestZeroLength(); }

TYPED_TEST(TestStringBuilder, TestOverflowCheck) { this->TestOverflowCheck(); }

// ----------------------------------------------------------------------
// ChunkedBinaryBuilder tests

Expand Down
32 changes: 32 additions & 0 deletions cpp/src/arrow/array/array_list_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,32 @@ class TestListArray : public TestBuilder {
AssertArraysEqual(*result_, *expected);
}

void TestOverflowCheck() {
Int16Builder* vb = checked_cast<Int16Builder*>(builder_->value_builder());
auto max_elements = builder_->maximum_elements();

ASSERT_OK(builder_->ValidateOverflow(1));
ASSERT_OK(builder_->ValidateOverflow(max_elements));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1));

ASSERT_OK(builder_->Append());
ASSERT_OK(vb->Append(1));
ASSERT_OK(vb->Append(2));
ASSERT_OK(builder_->ValidateOverflow(max_elements - 2));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1));

ASSERT_OK(builder_->AppendNull());
ASSERT_OK(builder_->ValidateOverflow(max_elements - 2));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1));

ASSERT_OK(builder_->Append());
ASSERT_OK(vb->Append(1));
ASSERT_OK(vb->Append(2));
ASSERT_OK(vb->Append(3));
ASSERT_OK(builder_->ValidateOverflow(max_elements - 5));
ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 4));
}

protected:
std::shared_ptr<DataType> value_type_;

Expand Down Expand Up @@ -508,6 +534,12 @@ TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); }

TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); }

#ifndef ARROW_LARGE_MEMORY_TESTS
TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck(); }
#else
TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); }
#endif

// ----------------------------------------------------------------------
// Map tests

Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/builder_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
return Status::OK();
}

Result<std::shared_ptr<Array>> ArrayBuilder::Finish() {
std::shared_ptr<Array> out;
RETURN_NOT_OK(Finish(&out));
return out;
}

void ArrayBuilder::Reset() {
capacity_ = length_ = null_count_ = 0;
null_bitmap_builder_.Reset();
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/array/builder_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class ARROW_EXPORT ArrayBuilder {
/// skip shared pointers and just return a raw pointer
ArrayBuilder* child(int i) { return children_[i].get(); }

const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }

int num_children() const { return static_cast<int>(children_.size()); }

virtual int64_t length() const { return length_; }
Expand Down Expand Up @@ -118,6 +120,13 @@ class ARROW_EXPORT ArrayBuilder {
/// \return Status
Status Finish(std::shared_ptr<Array>* out);

/// \brief Return result of builder as an Array object.
///
/// The builder is reset except for DictionaryBuilder.
///
/// \return The finalized Array object
Result<std::shared_ptr<Array>> Finish();

/// \brief Return the type of the built Array
virtual std::shared_ptr<DataType> type() const = 0;

Expand Down
48 changes: 31 additions & 17 deletions cpp/src/arrow/array/builder_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,7 @@ class BaseBinaryBuilder : public ArrayBuilder {

Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
ARROW_RETURN_NOT_OK(Reserve(length));
for (int64_t i = 0; i < length; ++i) {
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
Expand Down Expand Up @@ -232,6 +230,16 @@ class BaseBinaryBuilder : public ArrayBuilder {
value_data_builder_.Reset();
}

Status ValidateOverflow(int64_t new_bytes) {
auto new_size = value_data_builder_.length() + new_bytes;
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", new_size);
} else {
return Status::OK();
}
}

Status Resize(int64_t capacity) override {
// XXX Why is this check necessary? There is no reason to disallow, say,
// binary arrays with more than 2**31 empty or null values.
Expand All @@ -249,12 +257,8 @@ class BaseBinaryBuilder : public ArrayBuilder {
/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements) {
const int64_t size = value_data_length() + elements;
ARROW_RETURN_IF(size > memory_limit(),
Status::CapacityError("Cannot reserve capacity larger than ",
memory_limit(), " bytes"));
return (size > value_data_capacity()) ? value_data_builder_.Reserve(elements)
: Status::OK();
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return value_data_builder_.Reserve(elements);
}

Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
Expand Down Expand Up @@ -317,16 +321,9 @@ class BaseBinaryBuilder : public ArrayBuilder {
TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;

Status AppendOverflow(int64_t num_bytes) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", num_bytes);
}

Status AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
if (ARROW_PREDICT_FALSE(num_bytes > memory_limit())) {
return AppendOverflow(num_bytes);
}
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
}

Expand Down Expand Up @@ -462,6 +459,23 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
}

Status ValidateOverflow(int64_t new_bytes) const {
auto new_size = byte_builder_.length() + new_bytes;
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
return Status::CapacityError("array cannot contain more than ", memory_limit(),
" bytes, have ", new_size);
} else {
return Status::OK();
}
}

/// \brief Ensures there is enough allocated capacity to append the indicated
/// number of bytes to the value data buffer without additional allocations
Status ReserveData(int64_t elements) {
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
return byte_builder_.Reserve(elements);
}

void Reset() override;
Status Resize(int64_t capacity) override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/array/builder_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& ke
: MapBuilder(pool, key_builder, item_builder,
map(key_builder->type(), item_builder->type(), keys_sorted)) {}

MapBuilder::MapBuilder(MemoryPool* pool,
const std::shared_ptr<ArrayBuilder>& struct_builder,
const std::shared_ptr<DataType>& type)
: ArrayBuilder(pool) {
auto map_type = internal::checked_cast<const MapType*>(type.get());
keys_sorted_ = map_type->keys_sorted();
key_builder_ = struct_builder->child_builder(0);
item_builder_ = struct_builder->child_builder(1);
list_builder_ =
std::make_shared<ListBuilder>(pool, struct_builder, struct_builder->type());
}

Status MapBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(list_builder_->Resize(capacity));
capacity_ = list_builder_->capacity();
Expand Down Expand Up @@ -170,6 +182,19 @@ Status FixedSizeListBuilder::AppendNulls(int64_t length) {
return value_builder_->AppendNulls(list_size_ * length);
}

Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) {
auto new_length = value_builder_->length() + new_elements;
if (new_elements != list_size_) {
return Status::Invalid("Length of item not correct: expected ", list_size_,
" but got array of size ", new_elements);
}
if (new_length > maximum_elements()) {
return Status::CapacityError("array cannot contain more than ", maximum_elements(),
" elements, have ", new_elements);
}
return Status::OK();
}

Status FixedSizeListBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity));
return ArrayBuilder::Resize(capacity);
Expand Down
37 changes: 26 additions & 11 deletions cpp/src/arrow/array/builder_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class BaseListBuilder : public ArrayBuilder {

Status AppendNulls(int64_t length) final {
ARROW_RETURN_NOT_OK(Reserve(length));
ARROW_RETURN_NOT_OK(CheckNextOffset());
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
UnsafeAppendToBitmap(length, false);
const int64_t num_values = value_builder_->length();
for (int64_t i = 0; i < length; ++i) {
Expand Down Expand Up @@ -131,6 +131,16 @@ class BaseListBuilder : public ArrayBuilder {
return Status::OK();
}

Status ValidateOverflow(int64_t new_elements) const {
auto new_length = value_builder_->length() + new_elements;
if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
return Status::CapacityError("List array cannot contain more than ",
maximum_elements(), " elements, have ", new_elements);
} else {
return Status::OK();
}
}

ArrayBuilder* value_builder() const { return value_builder_.get(); }

// Cannot make this a static attribute because of linking issues
Expand All @@ -147,17 +157,8 @@ class BaseListBuilder : public ArrayBuilder {
std::shared_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Field> value_field_;

Status CheckNextOffset() const {
const int64_t num_values = value_builder_->length();
ARROW_RETURN_IF(
num_values > maximum_elements(),
Status::CapacityError("List array cannot contain more than ", maximum_elements(),
" child elements,", " have ", num_values));
return Status::OK();
}

Status AppendNextOffset() {
ARROW_RETURN_NOT_OK(CheckNextOffset());
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
const int64_t num_values = value_builder_->length();
return offsets_builder_.Append(static_cast<offset_type>(num_values));
}
Expand Down Expand Up @@ -227,6 +228,9 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);

MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
const std::shared_ptr<DataType>& type);

Status Resize(int64_t capacity) override;
void Reset() override;
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
Expand Down Expand Up @@ -276,6 +280,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
return map(key_builder_->type(), item_builder_->type(), keys_sorted_);
}

Status ValidateOverflow(int64_t new_elements) {
return list_builder_->ValidateOverflow(new_elements);
}

protected:
inline Status AdjustStructBuilderLength();

Expand Down Expand Up @@ -343,12 +351,19 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
/// automatically.
Status AppendNulls(int64_t length) final;

Status ValidateOverflow(int64_t new_elements);

ArrayBuilder* value_builder() const { return value_builder_.get(); }

std::shared_ptr<DataType> type() const override {
return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
}

// Cannot make this a static attribute because of linking issues
static constexpr int64_t maximum_elements() {
return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
}

protected:
std::shared_ptr<Field> value_field_;
const int32_t list_size_;
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/array/builder_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ namespace arrow {
class ARROW_EXPORT NullBuilder : public ArrayBuilder {
public:
explicit NullBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool) {}
explicit NullBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool())
: NullBuilder(pool) {}

/// \brief Append the specified number of null elements
Status AppendNulls(int64_t length) final {
Expand Down
Loading