From dfb2e2ea1ec5f089626ee86a1a6ca02b75b407b1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 11:02:12 -0500 Subject: [PATCH 1/4] API fixes for ARROW-33 patch --- src/parquet/arrow/arrow-reader-writer-test.cc | 6 +++-- src/parquet/arrow/reader.cc | 8 +++--- src/parquet/arrow/test-util.h | 25 +++++++++---------- src/parquet/arrow/writer.cc | 6 ++--- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 9a7fea92..8bf1e278 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -587,8 +587,10 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) { int64_data_ptr[i] = static_cast(uint32_data_ptr[i]); } } + + const int32_t kOffset = 0; ASSERT_OK(MakePrimitiveArray(std::make_shared<::arrow::Int64Type>(), values->length(), - int64_data, values->null_count(), values->null_bitmap(), &expected_values)); + int64_data, values->null_bitmap(), values->null_count(), kOffset, &expected_values)); this->ReadAndCheckSingleColumnTable(expected_values); } @@ -596,7 +598,7 @@ using TestStringParquetIO = TestParquetIO<::arrow::StringType>; TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) { std::shared_ptr values; - ::arrow::StringBuilder builder(::arrow::default_memory_pool(), ::arrow::utf8()); + ::arrow::StringBuilder builder(::arrow::default_memory_pool()); for (size_t i = 0; i < SMALL_SIZE; i++) { builder.Append(""); } diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc index 50594948..df34d4c6 100644 --- a/src/parquet/arrow/reader.cc +++ b/src/parquet/arrow/reader.cc @@ -616,7 +616,7 @@ Status ColumnReader::Impl::WrapIntoListArray(const int16_t* def_levels, auto list_type = std::make_shared<::arrow::ListType>( std::make_shared("item", output->type(), nullable[j + 1])); output = std::make_shared<::arrow::ListArray>( - list_type, list_lengths[j], offsets[j], output, null_counts[j], valid_bits[j]); + list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); } *array = output; } @@ -667,7 +667,7 @@ Status ColumnReader::Impl::TypedReadBatch(int batch_size, std::shared_ptr ::arrow::BitUtil::CeilByte(valid_bits_idx_) / 8, false)); } *out = std::make_shared>( - field_->type, valid_bits_idx_, data_buffer_, null_count_, valid_bits_buffer_); + field_->type, valid_bits_idx_, data_buffer_, valid_bits_buffer_, null_count_); // Relase the ownership as the Buffer is now part of a new Array valid_bits_buffer_.reset(); } else { @@ -741,7 +741,7 @@ Status ColumnReader::Impl::TypedReadBatch<::arrow::BooleanType, BooleanType>( valid_bits_buffer_ = valid_bits_buffer; } *out = std::make_shared( - field_->type, valid_bits_idx_, data_buffer_, null_count_, valid_bits_buffer_); + field_->type, valid_bits_idx_, data_buffer_, valid_bits_buffer_, null_count_); // Relase the ownership data_buffer_.reset(); valid_bits_buffer_.reset(); @@ -770,7 +770,7 @@ Status ColumnReader::Impl::ReadByteArrayBatch( int16_t* rep_levels = reinterpret_cast(rep_levels_buffer_.mutable_data()); int values_to_read = batch_size; - BuilderType builder(pool_, field_->type); + BuilderType builder(pool_); while ((values_to_read > 0) && column_reader_) { RETURN_NOT_OK(values_buffer_.Resize(values_to_read * sizeof(ByteArray), false)); auto reader = dynamic_cast*>(column_reader_.get()); diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index 4d87dd82..b03bde52 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -47,8 +47,7 @@ typename std::enable_if::value, Status>::type NonNullA size_t size, std::shared_ptr* out) { std::vector values; ::arrow::test::random_real(size, 0, 0, 1, &values); - ::arrow::NumericBuilder builder( - ::arrow::default_memory_pool(), std::make_shared()); + ::arrow::NumericBuilder builder(::arrow::default_memory_pool()); builder.Append(values.data(), values.size()); return builder.Finish(out); } @@ -58,8 +57,10 @@ typename std::enable_if::value, Status>::type NonNullArr size_t size, std::shared_ptr* out) { std::vector values; ::arrow::test::randint(size, 0, 64, &values); - ::arrow::NumericBuilder builder( - ::arrow::default_memory_pool(), std::make_shared()); + + // Passing data type so this will work with TimestampType too + ::arrow::NumericBuilder builder(::arrow::default_memory_pool(), + std::make_shared()); builder.Append(values.data(), values.size()); return builder.Finish(out); } @@ -69,7 +70,7 @@ typename std::enable_if< is_arrow_string::value || is_arrow_binary::value, Status>::type NonNullArray(size_t size, std::shared_ptr* out) { using BuilderType = typename ::arrow::TypeTraits::BuilderType; - BuilderType builder(::arrow::default_memory_pool(), std::make_shared()); + BuilderType builder(::arrow::default_memory_pool()); for (size_t i = 0; i < size; i++) { builder.Append("test-string"); } @@ -81,8 +82,7 @@ typename std::enable_if::value, Status>::type NonNullAr size_t size, std::shared_ptr* out) { std::vector values; ::arrow::test::randint(size, 0, 1, &values); - ::arrow::BooleanBuilder builder( - ::arrow::default_memory_pool(), std::make_shared<::arrow::BooleanType>()); + ::arrow::BooleanBuilder builder(::arrow::default_memory_pool()); builder.Append(values.data(), values.size()); return builder.Finish(out); } @@ -100,8 +100,7 @@ typename std::enable_if::value, Status>::type Nullable valid_bytes[i * 2] = 0; } - ::arrow::NumericBuilder builder( - ::arrow::default_memory_pool(), std::make_shared()); + ::arrow::NumericBuilder builder(::arrow::default_memory_pool()); builder.Append(values.data(), values.size(), valid_bytes.data()); return builder.Finish(out); } @@ -121,6 +120,7 @@ typename std::enable_if::value, Status>::type NullableAr valid_bytes[i * 2] = 0; } + // Passing data type so this will work with TimestampType too ::arrow::NumericBuilder builder( ::arrow::default_memory_pool(), std::make_shared()); builder.Append(values.data(), values.size(), valid_bytes.data()); @@ -140,7 +140,7 @@ NullableArray( } using BuilderType = typename ::arrow::TypeTraits::BuilderType; - BuilderType builder(::arrow::default_memory_pool(), std::make_shared()); + BuilderType builder(::arrow::default_memory_pool()); const int kBufferSize = 10; uint8_t buffer[kBufferSize]; @@ -171,8 +171,7 @@ typename std::enable_if::value, Status>::type NullableA valid_bytes[i * 2] = 0; } - ::arrow::BooleanBuilder builder( - ::arrow::default_memory_pool(), std::make_shared<::arrow::BooleanType>()); + ::arrow::BooleanBuilder builder(::arrow::default_memory_pool()); builder.Append(values.data(), values.size(), valid_bytes.data()); return builder.Finish(out); } @@ -211,7 +210,7 @@ Status MakeListArary(const std::shared_ptr& values, int64_t size, auto value_field = std::make_shared<::arrow::Field>("item", values->type(), nullable_values); *out = std::make_shared<::arrow::ListArray>( - ::arrow::list(value_field), size, offsets, values, null_count, null_bitmap); + ::arrow::list(value_field), size, offsets, values, null_bitmap, null_count); return Status::OK(); } diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index 75563138..0be6b692 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -88,10 +88,10 @@ class LevelBuilder : public ::arrow::ArrayVisitor { Status Visit(const ListArray& array) override { valid_bitmaps_.push_back(array.null_bitmap_data()); null_counts_.push_back(array.null_count()); - offsets_.push_back(array.raw_offsets()); + offsets_.push_back(array.raw_value_offsets()); - min_offset_idx_ = array.raw_offsets()[min_offset_idx_]; - max_offset_idx_ = array.raw_offsets()[max_offset_idx_]; + min_offset_idx_ = array.raw_value_offsets()[min_offset_idx_]; + max_offset_idx_ = array.raw_value_offsets()[max_offset_idx_]; return array.values()->Accept(this); } From b1b69b9447eb8793626b3f6ce510ad684afb684f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 11:10:27 -0500 Subject: [PATCH 2/4] clang-format --- src/parquet/arrow/arrow-reader-writer-test.cc | 3 ++- src/parquet/arrow/test-util.h | 4 ++-- src/parquet/util/cpu-info.cc | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 8bf1e278..09ae91f6 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -590,7 +590,8 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) { const int32_t kOffset = 0; ASSERT_OK(MakePrimitiveArray(std::make_shared<::arrow::Int64Type>(), values->length(), - int64_data, values->null_bitmap(), values->null_count(), kOffset, &expected_values)); + int64_data, values->null_bitmap(), values->null_count(), kOffset, + &expected_values)); this->ReadAndCheckSingleColumnTable(expected_values); } diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index b03bde52..bfc9ce18 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -59,8 +59,8 @@ typename std::enable_if::value, Status>::type NonNullArr ::arrow::test::randint(size, 0, 64, &values); // Passing data type so this will work with TimestampType too - ::arrow::NumericBuilder builder(::arrow::default_memory_pool(), - std::make_shared()); + ::arrow::NumericBuilder builder( + ::arrow::default_memory_pool(), std::make_shared()); builder.Append(values.data(), values.size()); return builder.Finish(out); } diff --git a/src/parquet/util/cpu-info.cc b/src/parquet/util/cpu-info.cc index dd31a318..ba0d1462 100644 --- a/src/parquet/util/cpu-info.cc +++ b/src/parquet/util/cpu-info.cc @@ -132,9 +132,9 @@ void CpuInfo::Init() { #else #ifndef _SC_LEVEL1_DCACHE_SIZE // Provide reasonable default values if no info - cache_sizes_[0] = 32 * 1024; // Level 1: 32k - cache_sizes_[1] = 256 * 1024; // Level 2: 256k - cache_sizes_[2] = 3072 * 1024; // Level 3: 3M + cache_sizes_[0] = 32 * 1024; // Level 1: 32k + cache_sizes_[1] = 256 * 1024; // Level 2: 256k + cache_sizes_[2] = 3072 * 1024; // Level 3: 3M #else // Call sysconf to query for the cache sizes cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); From 5976d598a97b027b5b3a74365dac1872e98006b5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 11:29:07 -0500 Subject: [PATCH 3/4] Update Arrow version to head with ARROW-33 --- cmake_modules/ThirdpartyToolchain.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 8b052df8..2e4f41bc 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -22,7 +22,7 @@ set(THRIFT_VERSION "0.9.1") # Brotli 0.5.2 does not install headers/libraries yet, but 0.6.0.dev does set(BROTLI_VERSION "5db62dcc9d386579609540cdf8869e95ad334bbd") -set(ARROW_VERSION "4226adfbc6b3dff10b3fe7a6691b30bcc94140bd") +set(ARROW_VERSION "5439b71586f4b0f9a36544b9e2417ee6ad7b48e8") # find boost headers and libs set(Boost_DEBUG TRUE) From 4966fcbc5cf56bdc786c29b602576c48b646fc5a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Feb 2017 15:32:20 -0500 Subject: [PATCH 4/4] Fix off-by-one error in int96 test case --- src/parquet/arrow/arrow-reader-writer-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 09ae91f6..63953ca5 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -491,7 +491,7 @@ TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) { // 2nd January 1970, 11:35min 145738543ns Int96 day; day.value[2] = 2440589l; - int64_t seconds = ((1 * 24 + 11) * 60 + 35) * 60; + int64_t seconds = (11 * 60 + 35) * 60; *(reinterpret_cast(&(day.value))) = seconds * 1000l * 1000l * 1000l + 145738543; // Compute the corresponding nanosecond timestamp