Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ macro(config_summary)
message(STATUS " Generator: ${CMAKE_GENERATOR}")
message(STATUS " Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS " Source directory: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS " Install prefix: ${CMAKE_INSTALL_PREFIX}")
if(${CMAKE_EXPORT_COMPILE_COMMANDS})
message(
STATUS " Compile commands: ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json")
Expand Down Expand Up @@ -415,6 +416,7 @@ macro(config_summary)
file(APPEND ${summary} "\"compile_commands\": "
"\"${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json\",\n")
endif()
file(APPEND ${summary} "\"install_prefix\": \"${CMAKE_INSTALL_PREFIX}\",\n")
file(APPEND ${summary} "\"arrow_version\": \"${ARROW_VERSION}\"\n")
file(APPEND ${summary} "}\n")
endif()
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/array-binary-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,20 @@ TEST_F(TestChunkedBinaryBuilder, BasicOperation) {
}
}

TEST_F(TestChunkedBinaryBuilder, Reserve) {
// ARROW-6060
const int32_t chunksize = 1000;
Init(chunksize);
ASSERT_OK(builder_->Reserve(chunksize / 2));
auto bytes_after_first_reserve = default_memory_pool()->bytes_allocated();
for (int i = 0; i < 8; ++i) {
ASSERT_OK(builder_->Reserve(chunksize / 2));
}
// no new memory will be allocated since capacity was sufficient for the loop's
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is that? We're reserving 4.5 * chunksize total, no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In between Reserve calls, no data is appended to the builder, so there is no need to allocate any additional memory.

If you look at the bug report in ARROW-6060 the issue appears to be that memory was being allocated when there was no need.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To review the docstring for Reserve

  /// \brief Ensure that there is enough space allocated to add the indicated
  /// number of elements without any further calls to Resize. Overallocation is
  /// used in order to minimize the impact of incremental Reserve() calls.
  ///
  /// \param[in] additional_capacity the number of additional array values
  /// \return Status

So if there is already enough memory allocated then Reserve should be a no-op

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(it might be worth stating in the docstring that calls to Reserve are not cumulative)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(it might be worth stating in the docstring that calls to Reserve are not cumulative)

Really? They are. It's the (confusing) difference between Resize and Reserve.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wesm I've added a note to the docstring

Copy link
Member

@wesm wesm Aug 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good opportunity to clarify the docstring if there is still confusion.

Reserve: pre-allocate space for an absolute, additional amount of data (prior calls to Reserve are not taken into consideration, so Reserve(1000)->Reserve(2000) is the same as Reserve(2000))
Resize: pre-allocate space for an absolute amount of data (regardless of how much has already been appended)

As we've seen here getting the behavior of Reserve wrong has made the 0.14.x release basically unusable for a number of users, so there are consequences to this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bkietz is it possible to have a more aggressive test, e.g. one that uses 16mb in the normal case and 16g in the falling one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fsaintjacques in the failing case to balloon to 16G all I'd need to do is increase the loop max; the calls to Reserve would continue increasing capacity by the growth factor (1.5) until we run out of memory. I'm not sure what merit increasing the test's scale has

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed with @bkietz. We don't need to test to crash, just fail, if Reserve() is buggy.

// Reserve() calls
ASSERT_EQ(default_memory_pool()->bytes_allocated(), bytes_after_first_reserve);
}

TEST_F(TestChunkedBinaryBuilder, NoData) {
Init(1000);

Expand Down
7 changes: 5 additions & 2 deletions cpp/src/arrow/array/builder_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,12 @@ class ARROW_EXPORT ArrayBuilder {
/// \return Status
virtual Status Resize(int64_t capacity);

/// \brief Ensure that there is enough space allocated to add the indicated
/// number of elements without any further calls to Resize. Overallocation is
/// \brief Ensure that there is enough space allocated to append the indicated
/// number of elements without any further reallocation. Overallocation is
/// used in order to minimize the impact of incremental Reserve() calls.
/// Note that additional_capacity is relative to the current number of elements
/// rather than to the current capacity, so calls to Reserve() which are not
/// interspersed with addition of new elements may not increase the capacity.
///
/// \param[in] additional_capacity the number of additional array values
/// \return Status
Expand Down
13 changes: 9 additions & 4 deletions cpp/src/arrow/array/builder_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,19 @@ Status ChunkedBinaryBuilder::Reserve(int64_t values) {
return Status::OK();
}

auto current_capacity = builder_->capacity();
auto min_capacity = builder_->length() + values;
auto new_capacity = BufferBuilder::GrowByFactor(builder_->capacity(), min_capacity);
if (ARROW_PREDICT_TRUE(new_capacity <= kListMaximumElements)) {
if (current_capacity >= min_capacity) {
return Status::OK();
}

auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
if (ARROW_PREDICT_TRUE(new_capacity <= max_chunk_length_)) {
return builder_->Resize(new_capacity);
}

extra_capacity_ = new_capacity - kListMaximumElements;
return builder_->Resize(kListMaximumElements);
extra_capacity_ = new_capacity - max_chunk_length_;
return builder_->Resize(max_chunk_length_);
}

} // namespace internal
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/arrow/array/builder_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,8 @@ namespace internal {

class ARROW_EXPORT ChunkedBinaryBuilder {
public:
ChunkedBinaryBuilder(int32_t max_chunk_value_length,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);

ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
Expand All @@ -534,9 +534,8 @@ class ARROW_EXPORT ChunkedBinaryBuilder {
}

if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
// The current item would cause builder_->value_data_length() to exceed
// max_chunk_size_, so finish this chunk and append the current item to the next
// chunk
// The current item would cause builder_->length() to exceed max_chunk_length_, so
// finish this chunk and append the current item to the next chunk
ARROW_RETURN_NOT_OK(NextChunk());
}

Expand Down