From 256b83fecaf2648d7316f03ba991ccaa4998e31f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 16 Jan 2025 15:43:35 +0100 Subject: [PATCH 001/102] [C++][Python][Parquet] Implement Content-Defined Chunking for the Parquet writer --- cpp/src/parquet/column_chunker.h | 313 +++++++++++++++++++++++++++++++ cpp/src/parquet/column_writer.cc | 41 +++- cpp/src/parquet/column_writer.h | 1 + cpp/src/parquet/properties.h | 64 ++++++- python/pyarrow/_parquet.pxd | 6 + python/pyarrow/_parquet.pyx | 26 ++- 6 files changed, 438 insertions(+), 13 deletions(-) create mode 100644 cpp/src/parquet/column_chunker.h diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h new file mode 100644 index 00000000000..0fb59be04c2 --- /dev/null +++ b/cpp/src/parquet/column_chunker.h @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include "arrow/array.h" +#include "parquet/level_conversion.h" + +using arrow::internal::checked_cast; + +namespace parquet { +namespace internal { + +// Constants +const uint64_t GEAR_HASH_TABLE[] = { + 0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777, + 0x368f573e8b7a31b7, 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb, + 0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad, + 0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, 0x99b07edc1570ad0f, + 0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd, + 0xf19f87ab86ad7436, 0xf2129fbfbe6bc736, 0x481149575c98a4ed, 0x0000010695477bc5, + 0x1fba37801a9ceacc, 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3, + 0xe4accf9e6211f420, 0x2520e71f87579071, 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873, + 0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, 0x86a6e5da1b09c2b1, + 0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3, + 0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079, + 0x484f7e9c97b2e199, 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8, + 0x000070940d87955a, 0x8ae69108139e626f, 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf, + 0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, 0xb0b219e6977d4c47, + 0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe, + 0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687, + 0x755a99374f4a5b07, 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8, + 0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, 0xec550255e6641b44, 0x78fb94a8449c14c6, + 0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, 0x14675f0b48ea4144, + 0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8, + 0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c, + 0x228d21f6ad450890, 
0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58, + 0x0c10ca932b3c0deb, 0x2727fee884afed7b, 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523, + 0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, 0xc2861181ddf18959, + 0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083, + 0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1, + 0xbee1797174e22416, 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15, + 0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635, + 0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, 0xf67a02bd8784b54f, + 0x696f11dd67e65063, 0x00002022fca814ab, 0x8cd6be912db9d852, 0x695189b6e9ae8a57, + 0xee9453b50ada0c28, 0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5, + 0x267310178e08a22e, 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06, + 0xa315f5ebfb706d26, 0x8816c34e3301bace, 0xe9395b9cbb71fdae, 0x002ce9202e721648, + 0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, 0xb8e0be4039fbc47c, + 0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef, + 0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489, + 0x8a1872a22b01f584, 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313, + 0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, 0x762ebf3759d75a5b, 0x207bfe823d693975, + 0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, 0xb27b1a29fc5e7816, + 0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85, + 0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01, + 0x77f8ae30ac277c5d, 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b, + 0xf3c40afa60de1104, 0x2063127aa59167c3, 0x621de62269d1894d, 0xd188ac1de62b4726, + 0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, 0xd9d6de6611b9f602, + 0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e, + 0x441364d354071836, 0x0082b36c75f2983e, 
0xb145910316fa66f0, 0x021c069c9847caf7, + 0x2910dfc75a4b5221, 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65, + 0xf05086a71257941b, 0xfec3b215d351cead, 0x00ae1055e0144202, 0xf54b40846f42e454, + 0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, 0x39ce4957a5e5d8d4, + 0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38, + 0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a, + 0xc22e770f4531689d, 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e, + 0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5, + 0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, 0x41fce516cd88f299, + 0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 0x8b8f5e8343b6ad56, + 0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b, + 0xda3f90178401b18e, 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9, + 0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, 0xecef1f410033e78a, 0x0024c2b274ac72cb, + 0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, 0xc986e3c76178739b, + 0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb, + 0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87, + 0x84321e13b9bbc816, 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8, + 0x00004f63381b10c3, 0x07d5b7816fcc4e10, 0xe5a536726a6a8155, 0x57afb23447a07fdd, + 0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, 0x63c7a906c1dd187b}; + +const uint64_t MASK = 0xffff00000000000; +// const int MIN_LEN = 65536 / 8; +// const int MAX_LEN = 65536 * 2; +const int64_t MIN_LEN = 256 * 1024; +const int64_t MAX_LEN = 2 * 1024 * 1024; + +// create a fake null array class with a GetView method returning 0 always +class FakeNullArray { + public: + uint8_t GetView(int64_t i) const { return 0; } + + std::shared_ptr<::arrow::DataType> type() const { return ::arrow::null(); } + + int64_t null_count() const { return 0; } +}; 
+ +class GearHash { + public: + GearHash(const LevelInfo& level_info, uint64_t mask, uint64_t min_len, uint64_t max_len) + : level_info_(level_info), + mask_(mask == 0 ? MASK : mask), + min_len_(min_len == 0 ? MIN_LEN : min_len), + max_len_(max_len == 0 ? MAX_LEN : max_len) {} + + template + bool Roll(const T value) { + constexpr size_t BYTE_WIDTH = sizeof(T); + chunk_size_ += BYTE_WIDTH; + // if (chunk_size_ < min_len_) { + // return false; + // } + auto bytes = reinterpret_cast(&value); + bool match = false; +#pragma unroll + for (size_t i = 0; i < BYTE_WIDTH; ++i) { + hash_ = (hash_ << 1) + GEAR_HASH_TABLE[bytes[i]]; + if ((hash_ & mask_) == 0) { + match = true; + } + } + return match; + } + + bool Roll(std::string_view value) { + chunk_size_ += value.size(); + // if (chunk_size_ < min_len_) { + // return false; + // } + bool match = false; + for (char c : value) { + hash_ = (hash_ << 1) + GEAR_HASH_TABLE[static_cast(c)]; + if ((hash_ & mask_) == 0) { + match = true; + } + } + return match; + } + + bool Check(bool match) { + if ((match && (chunk_size_ >= min_len_)) || (chunk_size_ >= max_len_)) { + chunk_size_ = 0; + return true; + } else { + return false; + } + } + + // bool Check(bool match) { + // if ((match && (chunk_size_ >= min_len_)) || (chunk_size_ >= max_len_)) { + // chunk_size_ = 0; + // return true; + // } else { + // return false; + // } + // } + + // template + // const std::vector> GetBoundaries( + // int64_t num_levels, const T& leaf_array) { + // std::vector> result; + + // int64_t offset = 0; + // int64_t prev_offset = 0; + + // while (offset < num_levels) { + // if (Check(Roll(leaf_array.GetView(offset)))) { + // result.push_back(std::make_tuple(prev_offset, prev_offset, offset - + // prev_offset)); prev_offset = offset; + // } + // ++offset; + // } + // if (prev_offset < num_levels) { + // result.push_back(std::make_tuple(prev_offset, prev_offset, num_levels - + // prev_offset)); + // } + // return result; + // } + + template + const 
std::vector> GetBoundaries( + const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, + const T& leaf_array) { + std::vector> result; + bool has_def_levels = level_info_.def_level > 0; + bool has_rep_levels = level_info_.rep_level > 0; + // bool no_nulls = leaf_array.null_count() == 0; + // if (!has_rep_levels && !maybe_parent_nulls && no_nulls) { + // return GetBoundaries(num_levels, leaf_array); + // } + + bool is_match; + int64_t level_offset = 0; + int64_t value_offset = 0; + int64_t record_level_offset = 0; + int64_t record_value_offset = 0; + int64_t prev_record_level_offset = 0; + int64_t prev_record_value_offset = 0; + + while (level_offset < num_levels) { + int16_t def_level = has_def_levels ? def_levels[level_offset] : 0; + int16_t rep_level = has_rep_levels ? rep_levels[level_offset] : 0; + + if (rep_level == 0) { + // record boundary + record_level_offset = level_offset; + record_value_offset = value_offset; + } + + is_match = Roll(def_level) || Roll(rep_level); + ++level_offset; + + if (has_rep_levels) { + if (def_level >= level_info_.repeated_ancestor_def_level) { + is_match |= Roll(leaf_array.GetView(value_offset)); + ++value_offset; + } + } else { + is_match |= Roll(leaf_array.GetView(value_offset)); + ++value_offset; + } + + if (Check(is_match)) { + auto levels_to_write = record_level_offset - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + prev_record_level_offset = record_level_offset; + prev_record_value_offset = record_value_offset; + } + } + } + + auto levels_to_write = num_levels - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + } + return result; + } + +#define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ + case ::arrow::Type::TYPE_ID: \ + return GetBoundaries(def_levels, rep_levels, num_levels, \ + checked_cast(values)); + + 
const ::arrow::Result>> GetBoundaries( + const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& values) { + auto type_id = values.type()->id(); + switch (type_id) { + PRIMITIVE_CASE(BOOL, Boolean) + PRIMITIVE_CASE(INT8, Int8) + PRIMITIVE_CASE(INT16, Int16) + PRIMITIVE_CASE(INT32, Int32) + PRIMITIVE_CASE(INT64, Int64) + PRIMITIVE_CASE(UINT8, UInt8) + PRIMITIVE_CASE(UINT16, UInt16) + PRIMITIVE_CASE(UINT32, UInt32) + PRIMITIVE_CASE(UINT64, UInt64) + PRIMITIVE_CASE(HALF_FLOAT, HalfFloat) + PRIMITIVE_CASE(FLOAT, Float) + PRIMITIVE_CASE(DOUBLE, Double) + PRIMITIVE_CASE(STRING, String) + PRIMITIVE_CASE(BINARY, Binary) + PRIMITIVE_CASE(FIXED_SIZE_BINARY, FixedSizeBinary) + PRIMITIVE_CASE(DATE32, Date32) + PRIMITIVE_CASE(DATE64, Date64) + PRIMITIVE_CASE(TIME32, Time32) + PRIMITIVE_CASE(TIME64, Time64) + PRIMITIVE_CASE(TIMESTAMP, Timestamp) + PRIMITIVE_CASE(DURATION, Duration) + PRIMITIVE_CASE(DECIMAL128, Decimal128) + PRIMITIVE_CASE(DECIMAL256, Decimal256) + case ::arrow::Type::DICTIONARY: + return GetBoundaries( + def_levels, rep_levels, num_levels, + *checked_cast(values).indices()); + case ::arrow::Type::NA: + FakeNullArray fake_null_array; + return GetBoundaries(def_levels, rep_levels, num_levels, fake_null_array); + default: + return ::arrow::Status::NotImplemented("Unsupported type " + + values.type()->ToString()); + } + } + + private: + const internal::LevelInfo& level_info_; + uint64_t mask_ = MASK; + uint64_t min_len_; + uint64_t max_len_; + uint64_t hash_ = 0; + uint64_t chunk_size_ = 0; +}; + +} // namespace internal +} // namespace parquet diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 546e09472a3..b6b00d4f8c2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -45,6 +45,7 @@ #include "arrow/util/rle_encoding_internal.h" #include "arrow/util/type_traits.h" #include "arrow/visit_array_inline.h" +#include "parquet/column_chunker.h" #include 
"parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/encryption/encryption_internal.h" @@ -752,7 +753,9 @@ class ColumnWriterImpl { closed_(false), fallback_(false), definition_levels_sink_(allocator_), - repetition_levels_sink_(allocator_) { + repetition_levels_sink_(allocator_), + content_defined_chunker_(level_info_, properties->cdc_mask(), + properties->cdc_min_size(), properties->cdc_max_size()) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -895,6 +898,8 @@ class ColumnWriterImpl { std::vector> data_pages_; + internal::GearHash content_defined_chunker_; + private: void InitSinks() { definition_levels_sink_.Rewind(0); @@ -1357,13 +1362,37 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, bits_buffer_->ZeroPadding(); } - if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) { - return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx, - maybe_parent_nulls); + if (this->properties_->cdc_enabled()) { + ARROW_ASSIGN_OR_RAISE(auto boundaries, + content_defined_chunker_.GetBoundaries( + def_levels, rep_levels, num_levels, leaf_array)); + for (auto boundary : boundaries) { + auto level_offset = std::get<0>(boundary); + auto array_offset = std::get<1>(boundary); + auto levels_to_write = std::get<2>(boundary); + auto sliced_array = leaf_array.Slice(array_offset); + if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) { + ARROW_CHECK_OK(WriteArrowDictionary(def_levels + level_offset, + rep_levels + level_offset, levels_to_write, + *sliced_array, ctx, maybe_parent_nulls)); + } else { + ARROW_CHECK_OK(WriteArrowDense(def_levels + level_offset, + rep_levels + level_offset, levels_to_write, + *sliced_array, ctx, maybe_parent_nulls)); + } + AddDataPage(); + } + return Status::OK(); } else { - return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx, - maybe_parent_nulls); + if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) { + 
return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx, + maybe_parent_nulls); + } else { + return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx, + maybe_parent_nulls); + } } + END_PARQUET_CATCH_EXCEPTIONS } diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index bd329d61053..2ef549150b3 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -23,6 +23,7 @@ #include "arrow/type_fwd.h" #include "arrow/util/compression.h" +#include "parquet/column_chunker.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a68307d37bb..bebad436fbc 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -27,6 +27,7 @@ #include "arrow/type.h" #include "arrow/util/compression.h" #include "arrow/util/type_fwd.h" +#include "parquet/column_chunker.h" #include "parquet/encryption/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" @@ -260,7 +261,11 @@ class PARQUET_EXPORT WriterProperties { created_by_(DEFAULT_CREATED_BY), store_decimal_as_integer_(false), page_checksum_enabled_(false), - size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL) {} + size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), + cdc_enabled_(false), + cdc_mask_(0), + cdc_min_size_(0), + cdc_max_size_(0) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -275,10 +280,38 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(properties.page_checksum_enabled()), size_statistics_level_(properties.size_statistics_level()), sorting_columns_(properties.sorting_columns()), - default_column_properties_(properties.default_column_properties()) {} + default_column_properties_(properties.default_column_properties()), + cdc_enabled_(properties.cdc_enabled()), + cdc_min_size_(properties.cdc_min_size()), + 
cdc_max_size_(properties.cdc_max_size()) {} virtual ~Builder() {} + Builder* enable_cdc() { + cdc_enabled_ = true; + return this; + } + + Builder* disable_cdc() { + cdc_enabled_ = false; + return this; + } + + Builder* cdc_mask(uint64_t mask) { + cdc_mask_ = mask; + return this; + } + + Builder* cdc_min_size(uint64_t min_size) { + cdc_min_size_ = min_size; + return this; + } + + Builder* cdc_max_size(uint64_t max_size) { + cdc_max_size_ = max_size; + return this; + } + /// Specify the memory pool for the writer. Default default_memory_pool. Builder* memory_pool(MemoryPool* pool) { pool_ = pool; @@ -701,7 +734,8 @@ class PARQUET_EXPORT WriterProperties { pagesize_, version_, created_by_, page_checksum_enabled_, size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, - store_decimal_as_integer_, std::move(sorting_columns_))); + store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, cdc_mask_, + cdc_min_size_, cdc_max_size_)); } private: @@ -730,6 +764,11 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map dictionary_enabled_; std::unordered_map statistics_enabled_; std::unordered_map page_index_enabled_; + + bool cdc_enabled_; + uint64_t cdc_mask_; + uint64_t cdc_min_size_; + uint64_t cdc_max_size_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -754,6 +793,11 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } + inline bool cdc_enabled() const { return cdc_enabled_; } + inline uint64_t cdc_mask() const { return cdc_mask_; } + inline uint64_t cdc_min_size() const { return cdc_min_size_; } + inline uint64_t cdc_max_size() const { return cdc_max_size_; } + inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; } @@ -856,7 +900,8 @@ class PARQUET_EXPORT WriterProperties { const ColumnProperties& default_column_properties, const std::unordered_map& 
column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, - std::vector sorting_columns) + std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_mask, + uint64_t cdc_min_size, uint64_t cdc_max_size) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -871,7 +916,11 @@ class PARQUET_EXPORT WriterProperties { file_encryption_properties_(file_encryption_properties), sorting_columns_(std::move(sorting_columns)), default_column_properties_(default_column_properties), - column_properties_(column_properties) {} + column_properties_(column_properties), + cdc_enabled_(cdc_enabled), + cdc_mask_(cdc_mask), + cdc_min_size_(cdc_min_size), + cdc_max_size_(cdc_max_size) {} MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -891,6 +940,11 @@ class PARQUET_EXPORT WriterProperties { ColumnProperties default_column_properties_; std::unordered_map column_properties_; + + bool cdc_enabled_; + uint64_t cdc_mask_; + uint64_t cdc_min_size_; + uint64_t cdc_max_size_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index fb502e1f737..9e5bec110eb 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -495,6 +495,11 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_write_page_index() Builder* enable_page_checksum() Builder* disable_page_checksum() + Builder* enable_cdc() + Builder* disable_cdc() + Builder* cdc_mask(uint64_t mask) + Builder* cdc_min_size(uint64_t min_size) + Builder* cdc_max_size(uint64_t max_size) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -646,6 +651,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=*, sorting_columns=*, store_decimal_as_integer=*, + content_defined_chunking=* ) except * diff --git a/python/pyarrow/_parquet.pyx 
b/python/pyarrow/_parquet.pyx index c29269d5a05..ef251937330 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1972,7 +1972,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_index=False, write_page_checksum=False, sorting_columns=None, - store_decimal_as_integer=False) except *: + store_decimal_as_integer=False, + content_defined_chunking=False) except *: """General writer properties""" cdef: @@ -2101,6 +2102,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( raise TypeError( "'column_encoding' should be a dictionary or a string") + # size limits + if data_page_size is not None: props.data_pagesize(data_page_size) @@ -2110,6 +2113,23 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( if dictionary_pagesize_limit is not None: props.dictionary_pagesize_limit(dictionary_pagesize_limit) + # content defined chunking + + if content_defined_chunking is False: + props.disable_cdc() + elif content_defined_chunking is True: + props.enable_cdc() + elif isinstance(content_defined_chunking, tuple): + mask, min_size, max_size = content_defined_chunking + props.enable_cdc() + props.cdc_mask(mask) + props.cdc_min_size(min_size) + props.cdc_max_size(max_size) + else: + raise ValueError( + "Unsupported value for content_defined_chunking: {0}" + .format(content_defined_chunking)) + # encryption if encryption_properties is not None: @@ -2259,7 +2279,8 @@ cdef class ParquetWriter(_Weakrefable): write_page_index=False, write_page_checksum=False, sorting_columns=None, - store_decimal_as_integer=False): + store_decimal_as_integer=False, + content_defined_chunking=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -2294,6 +2315,7 @@ cdef class ParquetWriter(_Weakrefable): write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, store_decimal_as_integer=store_decimal_as_integer, + content_defined_chunking=content_defined_chunking ) 
arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, From e699e9a88a2f1719a150e7d7c47489cf8aeb69e9 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 27 Jan 2025 20:27:34 +0100 Subject: [PATCH 002/102] always roll values --- cpp/src/parquet/column_chunker.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 0fb59be04c2..7bd0e51e19e 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -204,7 +204,7 @@ class GearHash { // return GetBoundaries(num_levels, leaf_array); // } - bool is_match; + bool def_match, rep_match, val_match; int64_t level_offset = 0; int64_t value_offset = 0; int64_t record_level_offset = 0; @@ -222,20 +222,23 @@ class GearHash { record_value_offset = value_offset; } - is_match = Roll(def_level) || Roll(rep_level); + def_match = Roll(def_level); + rep_match = Roll(rep_level); ++level_offset; if (has_rep_levels) { if (def_level >= level_info_.repeated_ancestor_def_level) { - is_match |= Roll(leaf_array.GetView(value_offset)); + val_match = Roll(leaf_array.GetView(value_offset)); ++value_offset; + } else { + val_match = false; } } else { - is_match |= Roll(leaf_array.GetView(value_offset)); + val_match = Roll(leaf_array.GetView(value_offset)); ++value_offset; } - if (Check(is_match)) { + if (Check(def_match || rep_match || val_match)) { auto levels_to_write = record_level_offset - prev_record_level_offset; if (levels_to_write > 0) { result.emplace_back(prev_record_level_offset, prev_record_value_offset, From ba53621c5a7969129fe59a294b6e16978c087285 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 27 Jan 2025 22:05:27 +0100 Subject: [PATCH 003/102] add faster paths for flat arrays --- cpp/src/parquet/column_chunker.h | 158 +++++++++++++++---------------- 1 file changed, 78 insertions(+), 80 deletions(-) diff --git 
a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 7bd0e51e19e..2e143a040e8 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -122,12 +122,11 @@ class GearHash { bool Roll(const T value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; - // if (chunk_size_ < min_len_) { - // return false; - // } + if (chunk_size_ < min_len_) { + return false; + } auto bytes = reinterpret_cast(&value); bool match = false; -#pragma unroll for (size_t i = 0; i < BYTE_WIDTH; ++i) { hash_ = (hash_ << 1) + GEAR_HASH_TABLE[bytes[i]]; if ((hash_ & mask_) == 0) { @@ -139,9 +138,9 @@ class GearHash { bool Roll(std::string_view value) { chunk_size_ += value.size(); - // if (chunk_size_ < min_len_) { - // return false; - // } + if (chunk_size_ < min_len_) { + return false; + } bool match = false; for (char c : value) { hash_ = (hash_ << 1) + GEAR_HASH_TABLE[static_cast(c)]; @@ -152,8 +151,8 @@ class GearHash { return match; } - bool Check(bool match) { - if ((match && (chunk_size_ >= min_len_)) || (chunk_size_ >= max_len_)) { + inline bool Check(bool match) { + if (match || (chunk_size_ >= max_len_)) { chunk_size_ = 0; return true; } else { @@ -161,37 +160,6 @@ class GearHash { } } - // bool Check(bool match) { - // if ((match && (chunk_size_ >= min_len_)) || (chunk_size_ >= max_len_)) { - // chunk_size_ = 0; - // return true; - // } else { - // return false; - // } - // } - - // template - // const std::vector> GetBoundaries( - // int64_t num_levels, const T& leaf_array) { - // std::vector> result; - - // int64_t offset = 0; - // int64_t prev_offset = 0; - - // while (offset < num_levels) { - // if (Check(Roll(leaf_array.GetView(offset)))) { - // result.push_back(std::make_tuple(prev_offset, prev_offset, offset - - // prev_offset)); prev_offset = offset; - // } - // ++offset; - // } - // if (prev_offset < num_levels) { - // result.push_back(std::make_tuple(prev_offset, prev_offset, num_levels - - // 
prev_offset)); - // } - // return result; - // } - template const std::vector> GetBoundaries( const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, @@ -199,61 +167,91 @@ class GearHash { std::vector> result; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; - // bool no_nulls = leaf_array.null_count() == 0; - // if (!has_rep_levels && !maybe_parent_nulls && no_nulls) { - // return GetBoundaries(num_levels, leaf_array); - // } - - bool def_match, rep_match, val_match; - int64_t level_offset = 0; - int64_t value_offset = 0; - int64_t record_level_offset = 0; - int64_t record_value_offset = 0; - int64_t prev_record_level_offset = 0; - int64_t prev_record_value_offset = 0; - while (level_offset < num_levels) { - int16_t def_level = has_def_levels ? def_levels[level_offset] : 0; - int16_t rep_level = has_rep_levels ? rep_levels[level_offset] : 0; - - if (rep_level == 0) { - // record boundary - record_level_offset = level_offset; - record_value_offset = value_offset; + if (!has_rep_levels && !has_def_levels) { + // fastest path for non-repeated non-null data + bool val_match; + int64_t offset = 0; + int64_t prev_offset = 0; + while (offset < num_levels) { + val_match = Roll(leaf_array.GetView(offset)); + ++offset; + if (Check(val_match)) { + result.emplace_back(prev_offset, prev_offset, offset - prev_offset); + prev_offset = offset; + } } + if (prev_offset < num_levels) { + result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + } + } else if (!has_rep_levels) { + // non-repeated data possibly with nulls + bool def_match, val_match; + int64_t offset = 0; + int64_t prev_offset = 0; + while (offset < num_levels) { + def_match = Roll(def_levels[offset]); + val_match = Roll(leaf_array.GetView(offset)); + ++offset; + if (Check(def_match || val_match)) { + result.emplace_back(prev_offset, prev_offset, offset - prev_offset); + prev_offset = offset; + } + } + if (prev_offset < num_levels) { + 
result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + } + } else { + // repeated data possibly with nulls + bool def_match, rep_match, val_match; + int16_t def_level; + int16_t rep_level; + int64_t level_offset = 0; + int64_t value_offset = 0; + int64_t record_level_offset = 0; + int64_t record_value_offset = 0; + int64_t prev_record_level_offset = 0; + int64_t prev_record_value_offset = 0; - def_match = Roll(def_level); - rep_match = Roll(rep_level); - ++level_offset; + while (level_offset < num_levels) { + def_level = def_levels[level_offset]; + rep_level = rep_levels[level_offset]; + ++level_offset; - if (has_rep_levels) { + if (rep_level == 0) { + // record boundary + record_level_offset = level_offset; + record_value_offset = value_offset; + } + + def_match = Roll(def_level); + rep_match = Roll(rep_level); if (def_level >= level_info_.repeated_ancestor_def_level) { val_match = Roll(leaf_array.GetView(value_offset)); ++value_offset; } else { val_match = false; } - } else { - val_match = Roll(leaf_array.GetView(value_offset)); - ++value_offset; - } - if (Check(def_match || rep_match || val_match)) { - auto levels_to_write = record_level_offset - prev_record_level_offset; - if (levels_to_write > 0) { - result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); - prev_record_level_offset = record_level_offset; - prev_record_value_offset = record_value_offset; + if (Check(def_match || rep_match || val_match)) { + auto levels_to_write = record_level_offset - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + prev_record_level_offset = record_level_offset; + prev_record_value_offset = record_value_offset; + } } } - } - auto levels_to_write = num_levels - prev_record_level_offset; - if (levels_to_write > 0) { - result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); + auto 
levels_to_write = num_levels - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + } + return result; } + return result; } From bfa5cbd9586900f457818bfdd98f594bb19794ab Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 30 Jan 2025 17:25:01 +0100 Subject: [PATCH 004/102] normalize chunk sizes according to fastcdc algorithm --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/column_chunker.h | 178 ++++++++++++++----------- cpp/src/parquet/column_chunker_test.cc | 16 +++ cpp/src/parquet/column_writer.cc | 6 +- cpp/src/parquet/properties.h | 20 +-- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 4 +- 7 files changed, 131 insertions(+), 96 deletions(-) create mode 100644 cpp/src/parquet/column_chunker_test.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 735653d677b..43319ab7a1f 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -395,6 +395,7 @@ add_parquet_test(reader-test add_parquet_test(writer-test SOURCES + column_chunker_test.cc column_writer_test.cc file_serialize_test.cc stream_writer_test.cc) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 2e143a040e8..478ebb2eecf 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -27,77 +27,74 @@ using arrow::internal::checked_cast; namespace parquet { namespace internal { -// Constants -const uint64_t GEAR_HASH_TABLE[] = { - 0xb088d3a9e840f559, 0x5652c7f739ed20d6, 0x45b28969898972ab, 0x6b0a89d5b68ec777, - 0x368f573e8b7a31b7, 0x1dc636dce936d94b, 0x207a4c4e5554d5b6, 0xa474b34628239acb, - 0x3b06a83e1ca3b912, 0x90e78d6c2f02baf7, 0xe1c92df7150d9a8a, 0x8e95053a1086d3ad, - 0x5a2ef4f1b83a0722, 0xa50fac949f807fae, 0x0e7303eb80d8d681, 0x99b07edc1570ad0f, - 0x689d2fb555fd3076, 0x00005082119ea468, 0xc4b08306a88fcc28, 0x3eb0678af6374afd, - 0xf19f87ab86ad7436, 
0xf2129fbfbe6bc736, 0x481149575c98a4ed, 0x0000010695477bc5, - 0x1fba37801a9ceacc, 0x3bf06fd663a49b6d, 0x99687e9782e3874b, 0x79a10673aa50d8e3, - 0xe4accf9e6211f420, 0x2520e71f87579071, 0x2bd5d3fd781a8a9b, 0x00de4dcddd11c873, - 0xeaa9311c5a87392f, 0xdb748eb617bc40ff, 0xaf579a8df620bf6f, 0x86a6e5da1b09c2b1, - 0xcc2fc30ac322a12e, 0x355e2afec1f74267, 0x2d99c8f4c021a47b, 0xbade4b4a9404cfc3, - 0xf7b518721d707d69, 0x3286b6587bf32c20, 0x0000b68886af270c, 0xa115d6e4db8a9079, - 0x484f7e9c97b2e199, 0xccca7bb75713e301, 0xbf2584a62bb0f160, 0xade7e813625dbcc8, - 0x000070940d87955a, 0x8ae69108139e626f, 0xbd776ad72fde38a2, 0xfb6b001fc2fcc0cf, - 0xc7a474b8e67bc427, 0xbaf6f11610eb5d58, 0x09cb1f5b6de770d1, 0xb0b219e6977d4c47, - 0x00ccbc386ea7ad4a, 0xcc849d0adf973f01, 0x73a3ef7d016af770, 0xc807d2d386bdbdfe, - 0x7f2ac9966c791730, 0xd037a86bc6c504da, 0xf3f17c661eaa609d, 0xaca626b04daae687, - 0x755a99374f4a5b07, 0x90837ee65b2caede, 0x6ee8ad93fd560785, 0x0000d9e11053edd8, - 0x9e063bb2d21cdbd7, 0x07ab77f12a01d2b2, 0xec550255e6641b44, 0x78fb94a8449c14c6, - 0xc7510e1bc6c0f5f5, 0x0000320b36e4cae3, 0x827c33262c8b1a2d, 0x14675f0b48ea4144, - 0x267bd3a6498deceb, 0xf1916ff982f5035e, 0x86221b7ff434fb88, 0x9dbecee7386f49d8, - 0xea58f8cac80f8f4a, 0x008d198692fc64d8, 0x6d38704fbabf9a36, 0xe032cb07d1e7be4c, - 0x228d21f6ad450890, 0x635cb1bfc02589a5, 0x4620a1739ca2ce71, 0xa7e7dfe3aae5fb58, - 0x0c10ca932b3c0deb, 0x2727fee884afed7b, 0xa2df1c6df9e2ab1f, 0x4dcdd1ac0774f523, - 0x000070ffad33e24e, 0xa2ace87bc5977816, 0x9892275ab4286049, 0xc2861181ddf18959, - 0xbb9972a042483e19, 0xef70cd3766513078, 0x00000513abfc9864, 0xc058b61858c94083, - 0x09e850859725e0de, 0x9197fb3bf83e7d94, 0x7e1e626d12b64bce, 0x520c54507f7b57d1, - 0xbee1797174e22416, 0x6fd9ac3222e95587, 0x0023957c9adfbf3e, 0xa01c7d7e234bbe15, - 0xaba2c758b8a38cbb, 0x0d1fa0ceec3e2b30, 0x0bb6a58b7e60b991, 0x4333dd5b9fa26635, - 0xc2fd3b7d4001c1a3, 0xfb41802454731127, 0x65a56185a50d18cb, 0xf67a02bd8784b54f, - 0x696f11dd67e65063, 0x00002022fca814ab, 
0x8cd6be912db9d852, 0x695189b6e9ae8a57, - 0xee9453b50ada0c28, 0xd8fc5ea91a78845e, 0xab86bf191a4aa767, 0x0000c6b5c86415e5, - 0x267310178e08a22e, 0xed2d101b078bca25, 0x3b41ed84b226a8fb, 0x13e622120f28dc06, - 0xa315f5ebfb706d26, 0x8816c34e3301bace, 0xe9395b9cbb71fdae, 0x002ce9202e721648, - 0x4283db1d2bb3c91c, 0xd77d461ad2b1a6a5, 0xe2ec17e46eeb866b, 0xb8e0be4039fbc47c, - 0xdea160c4d5299d04, 0x7eec86c8d28c3634, 0x2119ad129f98a399, 0xa6ccf46b61a283ef, - 0x2c52cedef658c617, 0x2db4871169acdd83, 0x0000f0d6f39ecbe9, 0x3dd5d8c98d2f9489, - 0x8a1872a22b01f584, 0xf282a4c40e7b3cf2, 0x8020ec2ccb1ba196, 0x6693b6e09e59e313, - 0x0000ce19cc7c83eb, 0x20cb5735f6479c3b, 0x762ebf3759d75a5b, 0x207bfe823d693975, - 0xd77dc112339cd9d5, 0x9ba7834284627d03, 0x217dc513e95f51e9, 0xb27b1a29fc5e7816, - 0x00d5cd9831bb662d, 0x71e39b806d75734c, 0x7e572af006fb1a23, 0xa2734f2f6ae91f85, - 0xbf82c6b5022cddf2, 0x5c3beac60761a0de, 0xcdc893bb47416998, 0x6d1085615c187e01, - 0x77f8ae30ac277c5d, 0x917c6b81122a2c91, 0x5b75b699add16967, 0x0000cf6ae79a069b, - 0xf3c40afa60de1104, 0x2063127aa59167c3, 0x621de62269d1894d, 0xd188ac1de62b4726, - 0x107036e2154b673c, 0x0000b85f28553a1d, 0xf2ef4e4c18236f3d, 0xd9d6de6611b9f602, - 0xa1fc7955fb47911c, 0xeb85fd032f298dbd, 0xbe27502fb3befae1, 0xe3034251c4cd661e, - 0x441364d354071836, 0x0082b36c75f2983e, 0xb145910316fa66f0, 0x021c069c9847caf7, - 0x2910dfc75a4b5221, 0x735b353e1c57a8b5, 0xce44312ce98ed96c, 0xbc942e4506bdfa65, - 0xf05086a71257941b, 0xfec3b215d351cead, 0x00ae1055e0144202, 0xf54b40846f42e454, - 0x00007fd9c8bcbcc8, 0xbfbd9ef317de9bfe, 0xa804302ff2854e12, 0x39ce4957a5e5d8d4, - 0xffb9e2a45637ba84, 0x55b9ad1d9ea0818b, 0x00008acbf319178a, 0x48e2bfc8d0fbfb38, - 0x8be39841e848b5e8, 0x0e2712160696a08b, 0xd51096e84b44242a, 0x1101ba176792e13a, - 0xc22e770f4531689d, 0x1689eff272bbc56c, 0x00a92a197f5650ec, 0xbc765990bda1784e, - 0xc61441e392fcb8ae, 0x07e13a2ced31e4a0, 0x92cbe984234e9d4d, 0x8f4ff572bb7d8ac5, - 0x0b9670c00b963bd0, 0x62955a581a03eb01, 0x645f83e5ea000254, 
0x41fce516cd88f299, - 0xbbda9748da7a98cf, 0x0000aab2fe4845fa, 0x19761b069bf56555, 0x8b8f5e8343b6ad56, - 0x3e5d1cfd144821d9, 0xec5c1e2ca2b0cd8f, 0xfaf7e0fea7fbb57f, 0x000000d3ba12961b, - 0xda3f90178401b18e, 0x70ff906de33a5feb, 0x0527d5a7c06970e7, 0x22d8e773607c13e9, - 0xc9ab70df643c3bac, 0xeda4c6dc8abe12e3, 0xecef1f410033e78a, 0x0024c2b274ac72cb, - 0x06740d954fa900b4, 0x1d7a299b323d6304, 0xb3c37cb298cbead5, 0xc986e3c76178739b, - 0x9fabea364b46f58a, 0x6da214c5af85cc56, 0x17a43ed8b7a38f84, 0x6eccec511d9adbeb, - 0xf9cab30913335afb, 0x4a5e60c5f415eed2, 0x00006967503672b4, 0x9da51d121454bb87, - 0x84321e13b9bbc816, 0xfb3d6fb6ab2fdd8d, 0x60305eed8e160a8d, 0xcbbf4b14e9946ce8, - 0x00004f63381b10c3, 0x07d5b7816fcc4e10, 0xe5a536726a6a8155, 0x57afb23447a07fdd, - 0x18f346f7abc9d394, 0x636dc655d61ad33d, 0xcc8bab4939f7f3f6, 0x63c7a906c1dd187b}; +const uint64_t GEAR_TABLE[256] = { + 0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7, + 0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649, + 0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226, + 0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14, + 0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6, + 0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb, + 0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98, + 0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99, + 0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01, + 0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f, + 0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915, + 0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2, + 0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565, + 0x6b479cd53d87febb, 0x6309e2d3f93db72f, 
0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968, + 0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116, + 0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2, + 0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040, + 0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3, + 0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651, + 0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc, + 0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 0xaeec189617e2d666, + 0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3, + 0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163, + 0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6, + 0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0, + 0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba, + 0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9, + 0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55, + 0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52, + 0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda, + 0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e, + 0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a, + 0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba, + 0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748, + 0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e, + 0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69, + 0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0, + 0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 
0x58e0fcc4f0aa362b, + 0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5, + 0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362, + 0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845, + 0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2, + 0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71, + 0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f, + 0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd, + 0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78, + 0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa, + 0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2, + 0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482, + 0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627, + 0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b, + 0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe, + 0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87, + 0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81, + 0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e, + 0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea, + 0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2, + 0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06, + 0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c, + 0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042, + 0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6, + 0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb, + 
0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780, + 0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1}; -const uint64_t MASK = 0xffff00000000000; -// const int MIN_LEN = 65536 / 8; -// const int MAX_LEN = 65536 * 2; const int64_t MIN_LEN = 256 * 1024; +const int64_t AVG_LEN = 1 * 1024 * 1024; const int64_t MAX_LEN = 2 * 1024 * 1024; // create a fake null array class with a GetView method returning 0 always @@ -110,26 +107,40 @@ class FakeNullArray { int64_t null_count() const { return 0; } }; -class GearHash { +static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { + size_t mask_bits = std::log2(avg_len); + size_t effective_bits = mask_bits + bit_adjustment; + return ((1ULL << effective_bits) - 1) << (64 - effective_bits); +} + +class FastCDC { public: - GearHash(const LevelInfo& level_info, uint64_t mask, uint64_t min_len, uint64_t max_len) + FastCDC(const LevelInfo& level_info, uint64_t min_len, uint64_t avg_len, + uint64_t max_len, uint8_t normalization_level = 1) : level_info_(level_info), - mask_(mask == 0 ? MASK : mask), min_len_(min_len == 0 ? MIN_LEN : min_len), - max_len_(max_len == 0 ? MAX_LEN : max_len) {} + avg_len_(avg_len == 0 ? AVG_LEN : avg_len), + max_len_(max_len == 0 ? 
MAX_LEN : max_len), + mask_s_(GetMask(avg_len_, -normalization_level)), + mask_l_(GetMask(avg_len_, +normalization_level)) {} template bool Roll(const T value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; + uint64_t mask; if (chunk_size_ < min_len_) { return false; + } else if (chunk_size_ < avg_len_) { + mask = mask_l_; + } else { + mask = mask_s_; } auto bytes = reinterpret_cast(&value); bool match = false; for (size_t i = 0; i < BYTE_WIDTH; ++i) { - hash_ = (hash_ << 1) + GEAR_HASH_TABLE[bytes[i]]; - if ((hash_ & mask_) == 0) { + hash_ = (hash_ << 1) + GEAR_TABLE[bytes[i]]; + if ((hash_ & mask) == 0) { match = true; } } @@ -138,13 +149,18 @@ class GearHash { bool Roll(std::string_view value) { chunk_size_ += value.size(); + uint64_t mask; if (chunk_size_ < min_len_) { return false; + } else if (chunk_size_ < avg_len_) { + mask = mask_l_; + } else { + mask = mask_s_; } bool match = false; for (char c : value) { - hash_ = (hash_ << 1) + GEAR_HASH_TABLE[static_cast(c)]; - if ((hash_ & mask_) == 0) { + hash_ = (hash_ << 1) + GEAR_TABLE[static_cast(c)]; + if ((hash_ & mask) == 0) { match = true; } } @@ -303,9 +319,11 @@ class GearHash { private: const internal::LevelInfo& level_info_; - uint64_t mask_ = MASK; - uint64_t min_len_; - uint64_t max_len_; + const uint64_t min_len_; + const uint64_t avg_len_; + const uint64_t max_len_; + const uint64_t mask_s_; + const uint64_t mask_l_; uint64_t hash_ = 0; uint64_t chunk_size_ = 0; }; diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc new file mode 100644 index 00000000000..b248758bc12 --- /dev/null +++ b/cpp/src/parquet/column_chunker_test.cc @@ -0,0 +1,16 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index b6b00d4f8c2..c73f7c0c360 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,8 +754,8 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_mask(), - properties->cdc_min_size(), properties->cdc_max_size()) { + content_defined_chunker_(level_info_, properties->cdc_min_size(), + properties->cdc_avg_size(), properties->cdc_max_size()) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -898,7 +898,7 @@ class ColumnWriterImpl { std::vector> data_pages_; - internal::GearHash content_defined_chunker_; + internal::FastCDC content_defined_chunker_; private: void InitSinks() { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index bebad436fbc..cba5e778aed 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -263,7 +263,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), cdc_enabled_(false), - cdc_mask_(0), + cdc_avg_size_(0), cdc_min_size_(0), cdc_max_size_(0) {} @@ -297,8 +297,8 @@ class PARQUET_EXPORT WriterProperties { return this; } - Builder* 
cdc_mask(uint64_t mask) { - cdc_mask_ = mask; + Builder* cdc_avg_size(uint64_t avg_size) { + cdc_avg_size_ = avg_size; return this; } @@ -734,8 +734,8 @@ class PARQUET_EXPORT WriterProperties { pagesize_, version_, created_by_, page_checksum_enabled_, size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, - store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, cdc_mask_, - cdc_min_size_, cdc_max_size_)); + store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, + cdc_avg_size_, cdc_min_size_, cdc_max_size_)); } private: @@ -766,7 +766,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map page_index_enabled_; bool cdc_enabled_; - uint64_t cdc_mask_; + uint64_t cdc_avg_size_; uint64_t cdc_min_size_; uint64_t cdc_max_size_; }; @@ -794,7 +794,7 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } inline bool cdc_enabled() const { return cdc_enabled_; } - inline uint64_t cdc_mask() const { return cdc_mask_; } + inline uint64_t cdc_avg_size() const { return cdc_avg_size_; } inline uint64_t cdc_min_size() const { return cdc_min_size_; } inline uint64_t cdc_max_size() const { return cdc_max_size_; } @@ -900,7 +900,7 @@ class PARQUET_EXPORT WriterProperties { const ColumnProperties& default_column_properties, const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, - std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_mask, + std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_avg_size, uint64_t cdc_min_size, uint64_t cdc_max_size) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), @@ -918,7 +918,7 @@ class PARQUET_EXPORT WriterProperties { default_column_properties_(default_column_properties), column_properties_(column_properties), cdc_enabled_(cdc_enabled), - cdc_mask_(cdc_mask), + 
cdc_avg_size_(cdc_avg_size), cdc_min_size_(cdc_min_size), cdc_max_size_(cdc_max_size) {} @@ -942,7 +942,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map column_properties_; bool cdc_enabled_; - uint64_t cdc_mask_; + uint64_t cdc_avg_size_; uint64_t cdc_min_size_; uint64_t cdc_max_size_; }; diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 9e5bec110eb..4ae821b5cca 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -497,7 +497,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_page_checksum() Builder* enable_cdc() Builder* disable_cdc() - Builder* cdc_mask(uint64_t mask) + Builder* cdc_avg_size(uint64_t avg_size) Builder* cdc_min_size(uint64_t min_size) Builder* cdc_max_size(uint64_t max_size) shared_ptr[WriterProperties] build() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index ef251937330..33c71d16272 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2120,9 +2120,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( elif content_defined_chunking is True: props.enable_cdc() elif isinstance(content_defined_chunking, tuple): - mask, min_size, max_size = content_defined_chunking + min_size, avg_size, max_size = content_defined_chunking props.enable_cdc() - props.cdc_mask(mask) + props.cdc_avg_size(avg_size) props.cdc_min_size(min_size) props.cdc_max_size(max_size) else: From 61617b423ca8b84ddaad2860648c7808784ec89d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 30 Jan 2025 20:10:56 +0100 Subject: [PATCH 005/102] missing header and fix level_offset incrementation --- cpp/src/parquet/column_chunker.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 478ebb2eecf..b7b17e26536 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -17,6 +17,7 @@ #pragma once 
+#include #include #include #include "arrow/array.h" @@ -108,7 +109,7 @@ class FakeNullArray { }; static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { - size_t mask_bits = std::log2(avg_len); + size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); size_t effective_bits = mask_bits + bit_adjustment; return ((1ULL << effective_bits) - 1) << (64 - effective_bits); } @@ -232,13 +233,11 @@ class FastCDC { while (level_offset < num_levels) { def_level = def_levels[level_offset]; rep_level = rep_levels[level_offset]; - ++level_offset; - if (rep_level == 0) { - // record boundary record_level_offset = level_offset; record_value_offset = value_offset; } + ++level_offset; def_match = Roll(def_level); rep_match = Roll(rep_level); From cc985bff28ef505d9c0dd2c8b8345fca8a27f988 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 31 Jan 2025 01:10:17 +0100 Subject: [PATCH 006/102] don't use normalization by default --- cpp/src/parquet/column_chunker.h | 36 ++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index b7b17e26536..d6ee6394099 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -21,6 +21,7 @@ #include #include #include "arrow/array.h" +#include "arrow/util/logging.h" #include "parquet/level_conversion.h" using arrow::internal::checked_cast; @@ -28,6 +29,21 @@ using arrow::internal::checked_cast; namespace parquet { namespace internal { +// const uint64_t MASK_TABLE[48] = { +// 0x8000000000000000, 0x8000008000000000, 0x8000800080000000, 0x8008008008000000, +// 0x8040100802000000, 0x8080808080800000, 0x8204081020400000, 0x8208208208200000, +// 0x8420842084200000, 0x8842108842100000, 0x8884442221100000, 0x8888888888880000, +// 0x9112224448880000, 0x9224489224480000, 0x9248924892480000, 0x9249249249240000, +// 0xa492924949240000, 0xa4a4a4a4a4a40000, 0xa5294a5294a40000, 0xa94a94a94a940000, +// 
0xaa54aa54aa540000, 0xaaa554aaa5540000, 0xaaaaaa5555540000, 0xaaaaaaaaaaaa0000, +// 0xd55555aaaaaa0000, 0xd55aaad55aaa0000, 0xd5aad5aad5aa0000, 0xd6ad6ad6ad6a0000, +// 0xdad6b5ad6b5a0000, 0xdadadadadada0000, 0xdb6d6db6b6da0000, 0xdb6db6db6db60000, +// 0xedb6edb6edb60000, 0xeddbb6eddbb60000, 0xeeedddbbb7760000, 0xeeeeeeeeeeee0000, +// 0xf77bbbdddeee0000, 0xf7bdeef7bdee0000, 0xfbdefbdefbde0000, 0xfbefbefbefbe0000, +// 0xfdfbf7efdfbe0000, 0xfefefefefefe0000, 0xffbfeff7fdfe0000, 0xffeffeffeffe0000, +// 0xfffefffefffe0000, 0xfffffefffffe0000, 0xfffffffffffe0000, 0xffffffffffff0000 +// }; + const uint64_t GEAR_TABLE[256] = { 0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7, 0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649, @@ -98,6 +114,10 @@ const int64_t MIN_LEN = 256 * 1024; const int64_t AVG_LEN = 1 * 1024 * 1024; const int64_t MAX_LEN = 2 * 1024 * 1024; +// const int64_t MIN_LEN = 512 * 1024; +// const int64_t AVG_LEN = 2 * MIN_LEN; +// const int64_t MAX_LEN = 2 * AVG_LEN; + // create a fake null array class with a GetView method returning 0 always class FakeNullArray { public: @@ -108,8 +128,20 @@ class FakeNullArray { int64_t null_count() const { return 0; } }; +// static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { +// size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); +// size_t effective_bits = mask_bits + bit_adjustment; +// return MASK_TABLE[effective_bits]; +// } + +// static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { +// size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); +// size_t effective_bits = mask_bits + bit_adjustment; +// return ((1ULL << effective_bits) - 1) << (64 - effective_bits); +// } + static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { - size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); + size_t mask_bits = 16; size_t effective_bits = mask_bits + bit_adjustment; return ((1ULL << effective_bits) 
- 1) << (64 - effective_bits); } @@ -117,7 +149,7 @@ static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { class FastCDC { public: FastCDC(const LevelInfo& level_info, uint64_t min_len, uint64_t avg_len, - uint64_t max_len, uint8_t normalization_level = 1) + uint64_t max_len, uint8_t normalization_level = 0) : level_info_(level_info), min_len_(min_len == 0 ? MIN_LEN : min_len), avg_len_(avg_len == 0 ? AVG_LEN : avg_len), From 1fedb895f1e9da2b887682de77abf0b67352c353 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Feb 2025 13:05:04 +0100 Subject: [PATCH 007/102] use constexpr for gear hash tables --- cpp/src/parquet/column_chunker.h | 678 +++++++++++++++++++++++++------ cpp/src/parquet/column_writer.cc | 8 +- cpp/src/parquet/properties.h | 34 +- python/pyarrow/_parquet.pxd | 2 - python/pyarrow/_parquet.pyx | 4 +- 5 files changed, 563 insertions(+), 163 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index d6ee6394099..30601707e1c 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -29,94 +29,530 @@ using arrow::internal::checked_cast; namespace parquet { namespace internal { -// const uint64_t MASK_TABLE[48] = { -// 0x8000000000000000, 0x8000008000000000, 0x8000800080000000, 0x8008008008000000, -// 0x8040100802000000, 0x8080808080800000, 0x8204081020400000, 0x8208208208200000, -// 0x8420842084200000, 0x8842108842100000, 0x8884442221100000, 0x8888888888880000, -// 0x9112224448880000, 0x9224489224480000, 0x9248924892480000, 0x9249249249240000, -// 0xa492924949240000, 0xa4a4a4a4a4a40000, 0xa5294a5294a40000, 0xa94a94a94a940000, -// 0xaa54aa54aa540000, 0xaaa554aaa5540000, 0xaaaaaa5555540000, 0xaaaaaaaaaaaa0000, -// 0xd55555aaaaaa0000, 0xd55aaad55aaa0000, 0xd5aad5aad5aa0000, 0xd6ad6ad6ad6a0000, -// 0xdad6b5ad6b5a0000, 0xdadadadadada0000, 0xdb6d6db6b6da0000, 0xdb6db6db6db60000, -// 0xedb6edb6edb60000, 0xeddbb6eddbb60000, 0xeeedddbbb7760000, 0xeeeeeeeeeeee0000, -// 
0xf77bbbdddeee0000, 0xf7bdeef7bdee0000, 0xfbdefbdefbde0000, 0xfbefbefbefbe0000, -// 0xfdfbf7efdfbe0000, 0xfefefefefefe0000, 0xffbfeff7fdfe0000, 0xffeffeffeffe0000, -// 0xfffefffefffe0000, 0xfffffefffffe0000, 0xfffffffffffe0000, 0xffffffffffff0000 -// }; - -const uint64_t GEAR_TABLE[256] = { - 0x3b5d3c7d207e37dc, 0x784d68ba91123086, 0xcd52880f882e7298, 0xeacf8e4e19fdcca7, - 0xc31f385dfbd1632b, 0x1d5f27001e25abe6, 0x83130bde3c9ad991, 0xc4b225676e9b7649, - 0xaa329b29e08eb499, 0xb67fcbd21e577d58, 0x0027baaada2acf6b, 0xe3ef2d5ac73c2226, - 0x0890f24d6ed312b7, 0xa809e036851d7c7e, 0xf0a6fe5e0013d81b, 0x1d026304452cec14, - 0x03864632648e248f, 0xcdaacf3dcd92b9b4, 0xf5e012e63c187856, 0x8862f9d3821c00b6, - 0xa82f7338750f6f8a, 0x1e583dc6c1cb0b6f, 0x7a3145b69743a7f1, 0xabb20fee404807eb, - 0xb14b3cfe07b83a5d, 0xb9dc27898adb9a0f, 0x3703f5e91baa62be, 0xcf0bb866815f7d98, - 0x3d9867c41ea9dcd3, 0x1be1fa65442bf22c, 0x14300da4c55631d9, 0xe698e9cbc6545c99, - 0x4763107ec64e92a5, 0xc65821fc65696a24, 0x76196c064822f0b7, 0x485be841f3525e01, - 0xf652bc9c85974ff5, 0xcad8352face9e3e9, 0x2a6ed1dceb35e98e, 0xc6f483badc11680f, - 0x3cfd8c17e9cf12f1, 0x89b83c5e2ea56471, 0xae665cfd24e392a9, 0xec33c4e504cb8915, - 0x3fb9b15fc9fe7451, 0xd7fd1fd1945f2195, 0x31ade0853443efd8, 0x255efc9863e1e2d2, - 0x10eab6008d5642cf, 0x46f04863257ac804, 0xa52dc42a789a27d3, 0xdaaadf9ce77af565, - 0x6b479cd53d87febb, 0x6309e2d3f93db72f, 0xc5738ffbaa1ff9d6, 0x6bd57f3f25af7968, - 0x67605486d90d0a4a, 0xe14d0b9663bfbdae, 0xb7bbd8d816eb0414, 0xdef8a4f16b35a116, - 0xe7932d85aaaffed6, 0x08161cbae90cfd48, 0x855507beb294f08b, 0x91234ea6ffd399b2, - 0xad70cf4b2435f302, 0xd289a97565bc2d27, 0x8e558437ffca99de, 0x96d2704b7115c040, - 0x0889bbcdfc660e41, 0x5e0d4e67dc92128d, 0x72a9f8917063ed97, 0x438b69d409e016e3, - 0xdf4fed8a5d8a4397, 0x00f41dcf41d403f7, 0x4814eb038e52603f, 0x9dafbacc58e2d651, - 0xfe2f458e4be170af, 0x4457ec414df6a940, 0x06e62f1451123314, 0xbd1014d173ba92cc, - 0xdef318e25ed57760, 0x9fea0de9dfca8525, 0x459de1e76c20624b, 
0xaeec189617e2d666, - 0x126a2c06ab5a83cb, 0xb1321532360f6132, 0x65421503dbb40123, 0x2d67c287ea089ab3, - 0x6c93bff5a56bd6b6, 0x4ffb2036cab6d98d, 0xce7b785b1be7ad4f, 0xedb42ef6189fd163, - 0xdc905288703988f6, 0x365f9c1d2c691884, 0xc640583680d99bfe, 0x3cd4624c07593ec6, - 0x7f1ea8d85d7c5805, 0x014842d480b57149, 0x0b649bcb5a828688, 0xbcd5708ed79b18f0, - 0xe987c862fbd2f2f0, 0x982731671f0cd82c, 0xbaf13e8b16d8c063, 0x8ea3109cbd951bba, - 0xd141045bfb385cad, 0x2acbc1a0af1f7d30, 0xe6444d89df03bfdf, 0xa18cc771b8188ff9, - 0x9834429db01c39bb, 0x214add07fe086a1f, 0x8f07c19b1f6b3ff9, 0x56a297b1bf4ffe55, - 0x94d558e493c54fc7, 0x40bfc24c764552cb, 0x931a706f8a8520cb, 0x32229d322935bd52, - 0x2560d0f5dc4fefaf, 0x9dbcc48355969bb6, 0x0fd81c3985c0b56a, 0xe03817e1560f2bda, - 0xc1bb4f81d892b2d5, 0xb0c4864f4e28d2d7, 0x3ecc49f9d9d6c263, 0x51307e99b52ba65e, - 0x8af2b688da84a752, 0xf5d72523b91b20b6, 0x6d95ff1ff4634806, 0x562f21555458339a, - 0xc0ce47f889336346, 0x487823e5089b40d8, 0xe4727c7ebc6d9592, 0x5a8f7277e94970ba, - 0xfca2f406b1c8bb50, 0x5b1f8a95f1791070, 0xd304af9fc9028605, 0x5440ab7fc930e748, - 0x312d25fbca2ab5a1, 0x10f4a4b234a4d575, 0x90301d55047e7473, 0x3b6372886c61591e, - 0x293402b77c444e06, 0x451f34a4d3e97dd7, 0x3158d814d81bc57b, 0x034942425b9bda69, - 0xe2032ff9e532d9bb, 0x62ae066b8b2179e5, 0x9545e10c2f8d71d8, 0x7ff7483eb2d23fc0, - 0x00945fcebdc98d86, 0x8764bbbe99b26ca2, 0x1b1ec62284c0bfc3, 0x58e0fcc4f0aa362b, - 0x5f4abefa878d458d, 0xfd74ac2f9607c519, 0xa4e3fb37df8cbfa9, 0xbf697e43cac574e5, - 0x86f14a3f68f4cd53, 0x24a23d076f1ce522, 0xe725cd8048868cc8, 0xbf3c729eb2464362, - 0xd8f6cd57b3cc1ed8, 0x6329e52425541577, 0x62aa688ad5ae1ac0, 0x0a242566269bf845, - 0x168b1a4753aca74b, 0xf789afefff2e7e3c, 0x6c3362093b6fccdb, 0x4ce8f50bd28c09b2, - 0x006a2db95ae8aa93, 0x975b0d623c3d1a8c, 0x18605d3935338c5b, 0x5bb6f6136cad3c71, - 0x0f53a20701f8d8a6, 0xab8c5ad2e7e93c67, 0x40b5ac5127acaa29, 0x8c7bf63c2075895f, - 0x78bd9f7e014a805c, 0xb2c9e9f4f9c8c032, 0xefd6049827eb91f3, 0x2be459f482c16fbd, - 
0xd92ce0c5745aaa8c, 0x0aaa8fb298d965b9, 0x2b37f92c6c803b15, 0x8c54a5e94e0f0e78, - 0x95f9b6e90c0a3032, 0xe7939faa436c7874, 0xd16bfe8f6a8a40c9, 0x44982b86263fd2fa, - 0xe285fb39f984e583, 0x779a8df72d7619d3, 0xf2d79a8de8d5dd1e, 0xd1037354d66684e2, - 0x004c82a4e668a8e5, 0x31d40a7668b044e6, 0xd70578538bd02c11, 0xdb45431078c5f482, - 0x977121bb7f6a51ad, 0x73d5ccbd34eff8dd, 0xe437a07d356e17cd, 0x47b2782043c95627, - 0x9fb251413e41d49a, 0xccd70b60652513d3, 0x1c95b31e8a1b49b2, 0xcae73dfd1bcb4c1b, - 0x34d98331b1f5b70f, 0x784e39f22338d92f, 0x18613d4a064df420, 0xf1d8dae25f0bcebe, - 0x33f77c15ae855efc, 0x3c88b3b912eb109c, 0x956a2ec96bafeea5, 0x1aa005b5e0ad0e87, - 0x5500d70527c4bb8e, 0xe36c57196421cc44, 0x13c4d286cc36ee39, 0x5654a23d818b2a81, - 0x77b1dc13d161abdc, 0x734f44de5f8d5eb5, 0x60717e174a6c89a2, 0xd47d9649266a211e, - 0x5b13a4322bb69e90, 0xf7669609f8b5fc3c, 0x21e6ac55bedcdac9, 0x9b56b62b61166dea, - 0xf48f66b939797e9c, 0x35f332f9c0e6ae9a, 0xcc733f6a9a878db0, 0x3da161e41cc108c2, - 0xb7d74ae535914d51, 0x4d493b0b11d36469, 0xce264d1dfba9741a, 0xa9d1f2dc7436dc06, - 0x70738016604c2a27, 0x231d36e96e93f3d5, 0x7666881197838d19, 0x4a2a83090aaad40c, - 0xf1e761591668b35d, 0x7363236497f730a7, 0x301080e37379dd4d, 0x502dea2971827042, - 0xc2c5eb858f32625f, 0x786afb9edfafbdff, 0xdaee0d868490b2a4, 0x617366b3268609f6, - 0xae0e35a0fe46173e, 0xd1a07de93e824f11, 0x079b8b115ea4cca8, 0x93a99274558faebb, - 0xfb1e6e22e08a03b3, 0xea635fdba3698dd0, 0xcf53659328503a5c, 0xcde3b31e6fd5d780, - 0x8e3e4221d3614413, 0xef14d0d86bf1a22c, 0xe1d830d3f16c5ddb, 0xaabd2b2a451504e1}; - -const int64_t MIN_LEN = 256 * 1024; -const int64_t AVG_LEN = 1 * 1024 * 1024; -const int64_t MAX_LEN = 2 * 1024 * 1024; +constexpr uint64_t GEAR_HASH_TABLE[8][256] = { + {// seed = 0 + 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, + 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, + 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, + 
0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, + 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, + 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, + 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, + 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, + 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, + 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, + 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, + 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, + 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, + 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, + 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, + 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, + 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, + 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, + 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, + 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, + 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, + 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, + 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, + 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, + 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, + 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, + 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, + 0xd521f4f264d55616, 
0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, + 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, + 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, + 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, + 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, + 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, + 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, + 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, + 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, + 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, + 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, + 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, + 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, + 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, + 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, + 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, + 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, + 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, + 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, + 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, + 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, + 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, + 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, + 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, + 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 
0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, + 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, + 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, + 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, + 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, + 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, + 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, + 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, + 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, + 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, + 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, + 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, + 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, + {// seed = 1 + 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, + 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, + 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, + 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, + 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, + 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, + 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, + 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, + 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, + 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, + 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, + 0xaa5865932fd79064, 0xf24d08d268c24593, 
0x7fbd00a215196999, 0x7812cd366d752964, + 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, + 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, + 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, + 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, + 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, + 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, + 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, + 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, + 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, + 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, + 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, + 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, + 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, + 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, + 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, + 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, + 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, + 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, + 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, + 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, + 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, + 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, + 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, + 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 
0x185147ec5779f0d7, + 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, + 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, + 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, + 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, + 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, + 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, + 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, + 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, + 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, + 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, + 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, + 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, + 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, + 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, + 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, + 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, + 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, + 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, + 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, + 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, + 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, + 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, + 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, + 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, + 
0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, + 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, + 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, + 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, + {// seed = 2 + 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, + 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, + 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, + 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, + 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, + 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, + 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, + 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, + 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, + 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, + 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, + 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, + 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, + 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, + 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, + 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, + 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, + 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, + 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, + 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, + 
0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, + 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, + 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, + 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, + 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, + 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, + 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, + 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, + 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, + 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, + 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, + 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, + 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, + 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, + 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, + 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, + 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, + 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, + 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, + 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, + 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, + 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, + 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, + 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, + 0x5f27162e38d98013, 
0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, + 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, + 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, + 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, + 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, + 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, + 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, + 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, + 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, + 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, + 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, + 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, + 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, + 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, + 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, + 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, + 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, + 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, + 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, + 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, + {// seed = 3 + 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, + 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, + 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, + 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, + 0x460c58a9092efee6, 
0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, + 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, + 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, + 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, + 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, + 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, + 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, + 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, + 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, + 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, + 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, + 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, + 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, + 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, + 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, + 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, + 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, + 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, + 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, + 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, + 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, + 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, + 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, + 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, + 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 
0xdd075d640013c0c0, 0xc026cd797d10922a, + 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, + 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, + 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, + 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, + 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, + 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, + 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, + 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, + 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, + 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, + 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, + 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, + 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, + 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, + 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, + 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, + 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, + 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, + 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, + 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, + 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, + 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, + 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, + 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 
0xfc8dfaddcc65ee80, + 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, + 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, + 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, + 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, + 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, + 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, + 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, + 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, + 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, + 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, + 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, + {// seed = 4 + 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, + 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, + 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, + 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, + 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, + 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, + 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, + 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, + 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, + 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, + 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, + 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, + 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 
0xf9d01eb5b4cf7eb6, + 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, + 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, + 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, + 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, + 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, + 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, + 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, + 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, + 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, + 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, + 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, + 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, + 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, + 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, + 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, + 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, + 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, + 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, + 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, + 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, + 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, + 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, + 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, + 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, + 
0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, + 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, + 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, + 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, + 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, + 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, + 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, + 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, + 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, + 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, + 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, + 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, + 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, + 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, + 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, + 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, + 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, + 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, + 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, + 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, + 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, + 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, + 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, + 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, + 0xf6c2d2c912dafdd0, 
0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, + 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, + 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, + {// seed = 5 + 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, + 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, + 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, + 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, + 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, + 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, + 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, + 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, + 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, + 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, + 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, + 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, + 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, + 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, + 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, + 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, + 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, + 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, + 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, + 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, + 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, + 0xb4a6f720943dad01, 
0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, + 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, + 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, + 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, + 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, + 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, + 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, + 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, + 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, + 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, + 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, + 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, + 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, + 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, + 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, + 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, + 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, + 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, + 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, + 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, + 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, + 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, + 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, + 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, + 0xab2af402cf5b7421, 0x897767f537644926, 
0x26c9c0473ca83695, 0x192e34e1881b2962, + 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, + 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, + 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, + 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, + 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, + 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, + 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, + 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, + 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, + 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, + 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, + 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, + 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, + 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, + 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, + 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, + 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, + 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, + {// seed = 6 + 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, + 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, + 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, + 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, + 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, + 0xd66ea581f16fce06, 0x175c9f002f57dc98, 
0xccbcfd0d32988775, 0xfde4c407d3b0a232, + 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, + 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, + 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, + 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, + 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, + 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, + 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, + 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, + 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, + 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, + 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, + 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, + 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, + 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, + 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, + 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, + 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, + 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, + 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, + 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, + 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, + 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, + 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, + 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 
0x21bb445e83537896, + 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, + 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, + 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, + 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, + 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, + 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, + 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, + 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, + 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, + 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, + 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, + 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, + 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, + 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, + 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, + 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, + 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, + 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, + 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, + 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, + 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, + 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, + 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, + 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, + 
0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, + 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, + 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, + 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, + 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, + 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, + 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, + 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, + 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, + 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, + {// seed = 7 + 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, + 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, + 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, + 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, + 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, + 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, + 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, + 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, + 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, + 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, + 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, + 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, + 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, + 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, + 
0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, + 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, + 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, + 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, + 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, + 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, + 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, + 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, + 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, + 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, + 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, + 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, + 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, + 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, + 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, + 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, + 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, + 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, + 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, + 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, + 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, + 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, + 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, + 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, + 0xc506d3f391834c6f, 
0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, + 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, + 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, + 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, + 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, + 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, + 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, + 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, + 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, + 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, + 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, + 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, + 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, + 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, + 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, + 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, + 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, + 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, + 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, + 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, + 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, + 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, + 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, + 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, + 0x50a2a95d0412e289, 0x9def8df564e68581, 
0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, + 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, +}; -// const int64_t MIN_LEN = 512 * 1024; -// const int64_t AVG_LEN = 2 * MIN_LEN; -// const int64_t MAX_LEN = 2 * AVG_LEN; +const int64_t AVG_LEN = 1024 * 1024; // create a fake null array class with a GetView method returning 0 always class FakeNullArray { @@ -128,52 +564,33 @@ class FakeNullArray { int64_t null_count() const { return 0; } }; -// static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { -// size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); -// size_t effective_bits = mask_bits + bit_adjustment; -// return MASK_TABLE[effective_bits]; -// } - -// static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { -// size_t mask_bits = static_cast(std::floor(std::log2(avg_len))); -// size_t effective_bits = mask_bits + bit_adjustment; -// return ((1ULL << effective_bits) - 1) << (64 - effective_bits); -// } - -static uint64_t GetMask(uint64_t avg_len, uint8_t bit_adjustment) { - size_t mask_bits = 16; - size_t effective_bits = mask_bits + bit_adjustment; - return ((1ULL << effective_bits) - 1) << (64 - effective_bits); +static uint64_t GetMask(uint64_t avg_len, size_t adjustement_level) { + size_t mask_bits = static_cast(std::ceil(std::log2(avg_len))); + return (1ULL << (mask_bits - adjustement_level)) - 1; } +// rename it since it is not FastCDC anymore class FastCDC { public: - FastCDC(const LevelInfo& level_info, uint64_t min_len, uint64_t avg_len, - uint64_t max_len, uint8_t normalization_level = 0) + FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5) : level_info_(level_info), - min_len_(min_len == 0 ? MIN_LEN : min_len), avg_len_(avg_len == 0 ? AVG_LEN : avg_len), - max_len_(max_len == 0 ? 
MAX_LEN : max_len), - mask_s_(GetMask(avg_len_, -normalization_level)), - mask_l_(GetMask(avg_len_, +normalization_level)) {} + min_len_(avg_len_ * 0.6), + max_len_(avg_len_ * 1.4), + hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} template bool Roll(const T value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; - uint64_t mask; if (chunk_size_ < min_len_) { return false; - } else if (chunk_size_ < avg_len_) { - mask = mask_l_; - } else { - mask = mask_s_; } auto bytes = reinterpret_cast(&value); bool match = false; for (size_t i = 0; i < BYTE_WIDTH; ++i) { - hash_ = (hash_ << 1) + GEAR_TABLE[bytes[i]]; - if ((hash_ & mask) == 0) { + rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; + if ((rolling_hash_ & hash_mask_) == 0) { match = true; } } @@ -182,18 +599,14 @@ class FastCDC { bool Roll(std::string_view value) { chunk_size_ += value.size(); - uint64_t mask; if (chunk_size_ < min_len_) { return false; - } else if (chunk_size_ < avg_len_) { - mask = mask_l_; - } else { - mask = mask_s_; } bool match = false; for (char c : value) { - hash_ = (hash_ << 1) + GEAR_TABLE[static_cast(c)]; - if ((hash_ & mask) == 0) { + rolling_hash_ = + (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; + if ((rolling_hash_ & hash_mask_) == 0) { match = true; } } @@ -201,7 +614,15 @@ class FastCDC { } inline bool Check(bool match) { - if (match || (chunk_size_ >= max_len_)) { + if (match) { + if (++nth_run_ >= 7) { + nth_run_ = 0; + chunk_size_ = 0; + return true; + } else { + return false; + } + } else if (chunk_size_ >= max_len_) { chunk_size_ = 0; return true; } else { @@ -350,13 +771,14 @@ class FastCDC { private: const internal::LevelInfo& level_info_; - const uint64_t min_len_; const uint64_t avg_len_; + const uint64_t min_len_; const uint64_t max_len_; - const uint64_t mask_s_; - const uint64_t mask_l_; - uint64_t hash_ = 0; + const uint64_t hash_mask_; + + uint8_t nth_run_ = 0; uint64_t chunk_size_ = 0; + 
uint64_t rolling_hash_ = 0; }; } // namespace internal diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index c73f7c0c360..67bff8795ee 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,8 +754,7 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_min_size(), - properties->cdc_avg_size(), properties->cdc_max_size()) { + content_defined_chunker_(level_info_, properties->cdc_avg_size()) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -1380,7 +1379,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, rep_levels + level_offset, levels_to_write, *sliced_array, ctx, maybe_parent_nulls)); } - AddDataPage(); + if (num_buffered_values_ > 0) { + AddDataPage(); + } + // AddDataPage(); } return Status::OK(); } else { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index cba5e778aed..86f590b68e8 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -263,9 +263,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), cdc_enabled_(false), - cdc_avg_size_(0), - cdc_min_size_(0), - cdc_max_size_(0) {} + cdc_avg_size_(0) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -282,8 +280,7 @@ class PARQUET_EXPORT WriterProperties { sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()), cdc_enabled_(properties.cdc_enabled()), - cdc_min_size_(properties.cdc_min_size()), - cdc_max_size_(properties.cdc_max_size()) {} + cdc_avg_size_(properties.cdc_avg_size()) {} virtual ~Builder() {} @@ -302,16 +299,6 @@ class PARQUET_EXPORT WriterProperties { return this; } - Builder* cdc_min_size(uint64_t min_size) { 
- cdc_min_size_ = min_size; - return this; - } - - Builder* cdc_max_size(uint64_t max_size) { - cdc_max_size_ = max_size; - return this; - } - /// Specify the memory pool for the writer. Default default_memory_pool. Builder* memory_pool(MemoryPool* pool) { pool_ = pool; @@ -735,7 +722,7 @@ class PARQUET_EXPORT WriterProperties { size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, - cdc_avg_size_, cdc_min_size_, cdc_max_size_)); + cdc_avg_size_)); } private: @@ -767,8 +754,6 @@ class PARQUET_EXPORT WriterProperties { bool cdc_enabled_; uint64_t cdc_avg_size_; - uint64_t cdc_min_size_; - uint64_t cdc_max_size_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -795,8 +780,6 @@ class PARQUET_EXPORT WriterProperties { inline bool cdc_enabled() const { return cdc_enabled_; } inline uint64_t cdc_avg_size() const { return cdc_avg_size_; } - inline uint64_t cdc_min_size() const { return cdc_min_size_; } - inline uint64_t cdc_max_size() const { return cdc_max_size_; } inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; @@ -900,8 +883,7 @@ class PARQUET_EXPORT WriterProperties { const ColumnProperties& default_column_properties, const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, - std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_avg_size, - uint64_t cdc_min_size, uint64_t cdc_max_size) + std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_avg_size) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -918,9 +900,9 @@ class PARQUET_EXPORT WriterProperties { default_column_properties_(default_column_properties), column_properties_(column_properties), cdc_enabled_(cdc_enabled), - cdc_avg_size_(cdc_avg_size), - 
cdc_min_size_(cdc_min_size), - cdc_max_size_(cdc_max_size) {} + cdc_avg_size_(cdc_avg_size) + + {} MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -943,8 +925,6 @@ class PARQUET_EXPORT WriterProperties { bool cdc_enabled_; uint64_t cdc_avg_size_; - uint64_t cdc_min_size_; - uint64_t cdc_max_size_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 4ae821b5cca..51986eba842 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -498,8 +498,6 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* enable_cdc() Builder* disable_cdc() Builder* cdc_avg_size(uint64_t avg_size) - Builder* cdc_min_size(uint64_t min_size) - Builder* cdc_max_size(uint64_t max_size) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 33c71d16272..ae53d6d868e 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2120,11 +2120,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( elif content_defined_chunking is True: props.enable_cdc() elif isinstance(content_defined_chunking, tuple): - min_size, avg_size, max_size = content_defined_chunking + avg_size, = content_defined_chunking props.enable_cdc() props.cdc_avg_size(avg_size) - props.cdc_min_size(min_size) - props.cdc_max_size(max_size) else: raise ValueError( "Unsupported value for content_defined_chunking: {0}" From 7c4d716a6557bc53b0fb5a4855c938b6be9eb302 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Feb 2025 17:23:19 +0100 Subject: [PATCH 008/102] don't include loging --- cpp/src/parquet/column_chunker.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 30601707e1c..9dd591943b8 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -21,7 
+21,6 @@ #include #include #include "arrow/array.h" -#include "arrow/util/logging.h" #include "parquet/level_conversion.h" using arrow::internal::checked_cast; From ffcea22bdc090a45214e2b51f739c1497276a091 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Feb 2025 17:31:32 +0100 Subject: [PATCH 009/102] please msvc --- cpp/src/parquet/column_chunker.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 9dd591943b8..d2840d72d79 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -574,8 +574,8 @@ class FastCDC { FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5) : level_info_(level_info), avg_len_(avg_len == 0 ? AVG_LEN : avg_len), - min_len_(avg_len_ * 0.6), - max_len_(avg_len_ * 1.4), + min_len_(static_cast(avg_len_ * 0.6)), + max_len_(static_cast(avg_len_ * 1.4)), hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} template From 9f3896e8da07e5a7458c3a1c183f3a62515b6f9c Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Feb 2025 10:41:26 +0100 Subject: [PATCH 010/102] increase the min/max bands around the avg chunk size --- cpp/src/parquet/column_chunker.h | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index d2840d72d79..f7d791168cd 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -574,8 +574,8 @@ class FastCDC { FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5) : level_info_(level_info), avg_len_(avg_len == 0 ? 
AVG_LEN : avg_len), - min_len_(static_cast(avg_len_ * 0.6)), - max_len_(static_cast(avg_len_ * 1.4)), + min_len_(static_cast(avg_len_ * 0.5)), + max_len_(static_cast(avg_len_ * 2.0)), hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} template @@ -589,9 +589,7 @@ class FastCDC { bool match = false; for (size_t i = 0; i < BYTE_WIDTH; ++i) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; - if ((rolling_hash_ & hash_mask_) == 0) { - match = true; - } + match |= (rolling_hash_ & hash_mask_) == 0; } return match; } @@ -605,23 +603,17 @@ class FastCDC { for (char c : value) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; - if ((rolling_hash_ & hash_mask_) == 0) { - match = true; - } + match |= (rolling_hash_ & hash_mask_) == 0; } return match; } inline bool Check(bool match) { - if (match) { - if (++nth_run_ >= 7) { - nth_run_ = 0; - chunk_size_ = 0; - return true; - } else { - return false; - } - } else if (chunk_size_ >= max_len_) { + if (ARROW_PREDICT_FALSE(match && (++nth_run_ >= 7))) { + nth_run_ = 0; + chunk_size_ = 0; + return true; + } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_len_)) { chunk_size_ = 0; return true; } else { @@ -693,7 +685,7 @@ class FastCDC { def_match = Roll(def_level); rep_match = Roll(rep_level); - if (def_level >= level_info_.repeated_ancestor_def_level) { + if (ARROW_PREDICT_TRUE(def_level >= level_info_.repeated_ancestor_def_level)) { val_match = Roll(leaf_array.GetView(value_offset)); ++value_offset; } else { From 32ad613beb621f7934b172e4b13c63af529404f4 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 14 Feb 2025 11:55:52 +0100 Subject: [PATCH 011/102] use a chunk struct instead of a tuple to carry boundary information --- cpp/src/parquet/column_chunker.h | 36 ++++++++++++++++++++++---------- cpp/src/parquet/column_writer.cc | 20 +++++++----------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/column_chunker.h 
b/cpp/src/parquet/column_chunker.h index f7d791168cd..6a9285c5b58 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -551,7 +551,7 @@ constexpr uint64_t GEAR_HASH_TABLE[8][256] = { 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, }; -const int64_t AVG_LEN = 1024 * 1024; +const uint64_t AVG_LEN = 1024 * 1024; // create a fake null array class with a GetView method returning 0 always class FakeNullArray { @@ -563,9 +563,22 @@ class FakeNullArray { int64_t null_count() const { return 0; } }; -static uint64_t GetMask(uint64_t avg_len, size_t adjustement_level) { - size_t mask_bits = static_cast(std::ceil(std::log2(avg_len))); - return (1ULL << (mask_bits - adjustement_level)) - 1; +struct Chunk { + int64_t level_offset; + int64_t value_offset; + int64_t levels_to_write; + + Chunk(int64_t level_offset, int64_t value_offset, int64_t levels_to_write) + : level_offset(level_offset), + value_offset(value_offset), + levels_to_write(levels_to_write) {} +}; + +static uint64_t GetMask(uint64_t min_size, uint64_t max_size) { + uint64_t avg_size = (min_size + max_size) / 2; + size_t mask_bits = static_cast(std::ceil(std::log2(avg_size))); + size_t effective_bits = mask_bits - 3 - 5; + return (1ULL << effective_bits) - 1; } // rename it since it is not FastCDC anymore @@ -622,10 +635,10 @@ class FastCDC { } template - const std::vector> GetBoundaries( - const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const T& leaf_array) { - std::vector> result; + const std::vector GetBoundaries(const int16_t* def_levels, + const int16_t* rep_levels, int64_t num_levels, + const T& leaf_array) { + std::vector result; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; @@ -719,9 +732,10 @@ class FastCDC { return GetBoundaries(def_levels, rep_levels, num_levels, \ checked_cast(values)); - const ::arrow::Result>> GetBoundaries( - const int16_t* def_levels, 
const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& values) { + const ::arrow::Result> GetBoundaries(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const ::arrow::Array& values) { auto type_id = values.type()->id(); switch (type_id) { PRIMITIVE_CASE(BOOL, Boolean) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 67bff8795ee..12c253806a8 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1365,24 +1365,20 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, ARROW_ASSIGN_OR_RAISE(auto boundaries, content_defined_chunker_.GetBoundaries( def_levels, rep_levels, num_levels, leaf_array)); - for (auto boundary : boundaries) { - auto level_offset = std::get<0>(boundary); - auto array_offset = std::get<1>(boundary); - auto levels_to_write = std::get<2>(boundary); - auto sliced_array = leaf_array.Slice(array_offset); + for (auto chunk : boundaries) { + auto sliced_array = leaf_array.Slice(chunk.value_offset); if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) { - ARROW_CHECK_OK(WriteArrowDictionary(def_levels + level_offset, - rep_levels + level_offset, levels_to_write, - *sliced_array, ctx, maybe_parent_nulls)); + ARROW_CHECK_OK(WriteArrowDictionary( + def_levels + chunk.level_offset, rep_levels + chunk.level_offset, + chunk.levels_to_write, *sliced_array, ctx, maybe_parent_nulls)); } else { - ARROW_CHECK_OK(WriteArrowDense(def_levels + level_offset, - rep_levels + level_offset, levels_to_write, - *sliced_array, ctx, maybe_parent_nulls)); + ARROW_CHECK_OK(WriteArrowDense( + def_levels + chunk.level_offset, rep_levels + chunk.level_offset, + chunk.levels_to_write, *sliced_array, ctx, maybe_parent_nulls)); } if (num_buffered_values_ > 0) { AddDataPage(); } - // AddDataPage(); } return Status::OK(); } else { From 2886e176c65dcce99912ddd8d9a85ee6865c62ad Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 14 Feb 2025 12:43:57 +0100 
Subject: [PATCH 012/102] split implementation and header files --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/column_chunker.cc | 764 ++++++++++++++++++++++++++++++ cpp/src/parquet/column_chunker.h | 738 +---------------------------- 3 files changed, 777 insertions(+), 726 deletions(-) create mode 100644 cpp/src/parquet/column_chunker.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 43319ab7a1f..31dbcdd7cc3 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -160,6 +160,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc + column_chunker.cc column_reader.cc column_scanner.cc column_writer.cc diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc new file mode 100644 index 00000000000..a9564a79e7c --- /dev/null +++ b/cpp/src/parquet/column_chunker.cc @@ -0,0 +1,764 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "parquet/column_chunker.h" +#include +#include +#include +#include "arrow/array.h" +#include "parquet/level_conversion.h" + +using arrow::internal::checked_cast; + +namespace parquet { +namespace internal { + +constexpr uint64_t GEAR_HASH_TABLE[8][256] = { + {// seed = 0 + 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, + 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, + 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, + 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, + 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, + 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, + 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, + 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, + 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, + 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, + 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, + 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, + 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, + 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, + 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, + 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, + 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, + 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, + 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, + 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, + 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 
0x683b25c43a0716cf, + 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, + 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, + 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, + 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, + 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, + 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, + 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, + 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, + 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, + 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, + 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, + 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, + 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, + 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, + 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, + 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, + 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, + 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, + 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, + 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, + 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, + 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, + 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, + 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, + 
0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, + 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, + 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, + 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, + 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, + 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, + 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, + 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, + 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, + 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, + 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, + 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, + 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, + 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, + 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, + 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, + 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, + 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, + 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, + {// seed = 1 + 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, + 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, + 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, + 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, + 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, + 
0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, + 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, + 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, + 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, + 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, + 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, + 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, + 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, + 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, + 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, + 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, + 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, + 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, + 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, + 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, + 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, + 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, + 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, + 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, + 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, + 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, + 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, + 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, + 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, + 0x343732993cfed89f, 
0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, + 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, + 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, + 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, + 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, + 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, + 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, + 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, + 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, + 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, + 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, + 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, + 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, + 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, + 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, + 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, + 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, + 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, + 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, + 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, + 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, + 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, + 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, + 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, + 0xee90572b86a30d91, 0x549e72d8262830aa, 
0x3361564b469f32c6, 0x1e6eba9e0d2648e2, + 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, + 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, + 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, + 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, + 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, + 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, + 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, + 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, + 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, + 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, + {// seed = 2 + 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, + 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, + 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, + 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, + 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, + 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, + 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, + 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, + 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, + 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, + 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, + 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, + 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, + 0x357c63357caa9292, 0x819eb7d1aee2d27e, 
0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, + 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, + 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, + 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, + 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, + 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, + 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, + 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, + 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, + 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, + 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, + 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, + 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, + 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, + 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, + 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, + 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, + 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, + 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, + 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, + 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, + 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, + 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, + 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, + 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 
0xbdc656b8613d8805, + 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, + 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, + 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, + 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, + 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, + 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, + 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, + 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, + 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, + 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, + 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, + 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, + 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, + 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, + 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, + 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, + 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, + 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, + 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, + 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, + 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, + 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, + 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, + 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, + 
0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, + 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, + {// seed = 3 + 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, + 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, + 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, + 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, + 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, + 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, + 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, + 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, + 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, + 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, + 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, + 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, + 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, + 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, + 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, + 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, + 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, + 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, + 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, + 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, + 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, + 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, + 
0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, + 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, + 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, + 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, + 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, + 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, + 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, + 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, + 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, + 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, + 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, + 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, + 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, + 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, + 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, + 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, + 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, + 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, + 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, + 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, + 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, + 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, + 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, + 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, + 0xd3b8013502acd838, 
0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, + 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, + 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, + 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, + 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, + 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, + 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, + 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, + 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, + 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, + 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, + 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, + 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, + 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, + 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, + 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, + 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, + 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, + {// seed = 4 + 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, + 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, + 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, + 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, + 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, + 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, + 0x10f8308c0d293719, 
0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, + 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, + 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, + 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, + 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, + 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, + 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, + 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, + 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, + 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, + 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, + 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, + 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, + 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, + 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, + 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, + 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, + 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, + 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, + 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, + 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, + 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, + 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, + 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, + 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 
0x5679f1c51ffb225d, 0xdab79048317d77ee, + 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, + 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, + 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, + 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, + 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, + 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, + 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, + 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, + 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, + 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, + 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, + 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, + 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, + 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, + 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, + 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, + 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, + 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, + 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, + 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, + 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, + 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, + 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, + 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 
0x1b6de96cdd9943ae, + 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, + 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, + 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, + 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, + 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, + 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, + 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, + 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, + 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, + {// seed = 5 + 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, + 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, + 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, + 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, + 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, + 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, + 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, + 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, + 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, + 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, + 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, + 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, + 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, + 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, + 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 
0xce5ad4d7f3f67d3b, + 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, + 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, + 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, + 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, + 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, + 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, + 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, + 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, + 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, + 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, + 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, + 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, + 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, + 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, + 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, + 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, + 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, + 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, + 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, + 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, + 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, + 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, + 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, + 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, + 
0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, + 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, + 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, + 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, + 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, + 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, + 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, + 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, + 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, + 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, + 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, + 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, + 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, + 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, + 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, + 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, + 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, + 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, + 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, + 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, + 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, + 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, + 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, + 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, + 0x559231d927c84410, 
0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, + {// seed = 6 + 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, + 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, + 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, + 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, + 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, + 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, + 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, + 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, + 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, + 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, + 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, + 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, + 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, + 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, + 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, + 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, + 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, + 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, + 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, + 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, + 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, + 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, + 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, + 0x9eb3694d1e9cb7e1, 
0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, + 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, + 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, + 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, + 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, + 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, + 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, + 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, + 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, + 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, + 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, + 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, + 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, + 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, + 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, + 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, + 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, + 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, + 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, + 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, + 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, + 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, + 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, + 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, + 0xaf02b8b925656fbd, 0x7a722a767179a630, 
0xb5c8afe937a75ace, 0xfdb8e8d02d279372, + 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, + 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, + 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, + 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, + 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, + 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, + 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, + 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, + 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, + 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, + 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, + 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, + 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, + 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, + 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, + 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, + {// seed = 7 + 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, + 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, + 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, + 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, + 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, + 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, + 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, + 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 
0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, + 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, + 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, + 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, + 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, + 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, + 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, + 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, + 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, + 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, + 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, + 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, + 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, + 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, + 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, + 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, + 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, + 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, + 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, + 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, + 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, + 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, + 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, + 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, + 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 
0x7d92253aa99ac39b, + 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, + 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, + 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, + 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, + 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, + 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, + 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, + 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, + 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, + 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, + 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, + 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, + 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, + 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, + 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, + 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, + 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, + 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, + 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, + 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, + 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, + 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, + 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, + 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, + 
0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, + 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, + 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, + 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, + 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, + 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, + 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, + 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, +}; + +const uint64_t AVG_LEN = 1024 * 1024; + +// create a fake null array class with a GetView method returning 0 always +class FakeNullArray { + public: + uint8_t GetView(int64_t i) const { return 0; } + + std::shared_ptr<::arrow::DataType> type() const { return ::arrow::null(); } + + int64_t null_count() const { return 0; } +}; + +static uint64_t GetMask(uint64_t min_size, uint64_t max_size) { + uint64_t avg_size = (min_size + max_size) / 2; + size_t mask_bits = static_cast(std::ceil(std::log2(avg_size))); + size_t effective_bits = mask_bits - 3 - 5; + return (1ULL << effective_bits) - 1; +} + +// rename it since it is not FastCDC anymore + +FastCDC::FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level) + : level_info_(level_info), + avg_len_(avg_len == 0 ? 
AVG_LEN : avg_len), + min_len_(static_cast(avg_len_ * 0.5)), + max_len_(static_cast(avg_len_ * 2.0)), + hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} + +template +bool FastCDC::Roll(const T value) { + constexpr size_t BYTE_WIDTH = sizeof(T); + chunk_size_ += BYTE_WIDTH; + if (chunk_size_ < min_len_) { + return false; + } + auto bytes = reinterpret_cast(&value); + bool match = false; + for (size_t i = 0; i < BYTE_WIDTH; ++i) { + rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; + match |= (rolling_hash_ & hash_mask_) == 0; + } + return match; +} + +bool FastCDC::Roll(std::string_view value) { + chunk_size_ += value.size(); + if (chunk_size_ < min_len_) { + return false; + } + bool match = false; + for (char c : value) { + rolling_hash_ = + (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; + match |= (rolling_hash_ & hash_mask_) == 0; + } + return match; +} + +bool FastCDC::Check(bool match) { + if (ARROW_PREDICT_FALSE(match && (++nth_run_ >= 7))) { + nth_run_ = 0; + chunk_size_ = 0; + return true; + } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_len_)) { + chunk_size_ = 0; + return true; + } else { + return false; + } +} + +template +const std::vector FastCDC::Calculate(const int16_t* def_levels, + const int16_t* rep_levels, int64_t num_levels, + const T& leaf_array) { + std::vector result; + bool has_def_levels = level_info_.def_level > 0; + bool has_rep_levels = level_info_.rep_level > 0; + + if (!has_rep_levels && !has_def_levels) { + // fastest path for non-repeated non-null data + bool val_match; + int64_t offset = 0; + int64_t prev_offset = 0; + while (offset < num_levels) { + val_match = Roll(leaf_array.GetView(offset)); + ++offset; + if (Check(val_match)) { + result.emplace_back(prev_offset, prev_offset, offset - prev_offset); + prev_offset = offset; + } + } + if (prev_offset < num_levels) { + result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + } + } else if (!has_rep_levels) { + 
// non-repeated data possibly with nulls + bool def_match, val_match; + int64_t offset = 0; + int64_t prev_offset = 0; + while (offset < num_levels) { + def_match = Roll(def_levels[offset]); + val_match = Roll(leaf_array.GetView(offset)); + ++offset; + if (Check(def_match || val_match)) { + result.emplace_back(prev_offset, prev_offset, offset - prev_offset); + prev_offset = offset; + } + } + if (prev_offset < num_levels) { + result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + } + } else { + // repeated data possibly with nulls + bool def_match, rep_match, val_match; + int16_t def_level; + int16_t rep_level; + int64_t level_offset = 0; + int64_t value_offset = 0; + int64_t record_level_offset = 0; + int64_t record_value_offset = 0; + int64_t prev_record_level_offset = 0; + int64_t prev_record_value_offset = 0; + + while (level_offset < num_levels) { + def_level = def_levels[level_offset]; + rep_level = rep_levels[level_offset]; + if (rep_level == 0) { + record_level_offset = level_offset; + record_value_offset = value_offset; + } + ++level_offset; + + def_match = Roll(def_level); + rep_match = Roll(rep_level); + if (ARROW_PREDICT_TRUE(def_level >= level_info_.repeated_ancestor_def_level)) { + val_match = Roll(leaf_array.GetView(value_offset)); + ++value_offset; + } else { + val_match = false; + } + + if (Check(def_match || rep_match || val_match)) { + auto levels_to_write = record_level_offset - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + prev_record_level_offset = record_level_offset; + prev_record_value_offset = record_value_offset; + } + } + } + + auto levels_to_write = num_levels - prev_record_level_offset; + if (levels_to_write > 0) { + result.emplace_back(prev_record_level_offset, prev_record_value_offset, + levels_to_write); + } + return result; + } + + return result; +} + +#define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ + case 
::arrow::Type::TYPE_ID: \ + return Calculate(def_levels, rep_levels, num_levels, \ + checked_cast(values)); + +const ::arrow::Result> FastCDC::GetBoundaries( + const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& values) { + auto type_id = values.type()->id(); + switch (type_id) { + PRIMITIVE_CASE(BOOL, Boolean) + PRIMITIVE_CASE(INT8, Int8) + PRIMITIVE_CASE(INT16, Int16) + PRIMITIVE_CASE(INT32, Int32) + PRIMITIVE_CASE(INT64, Int64) + PRIMITIVE_CASE(UINT8, UInt8) + PRIMITIVE_CASE(UINT16, UInt16) + PRIMITIVE_CASE(UINT32, UInt32) + PRIMITIVE_CASE(UINT64, UInt64) + PRIMITIVE_CASE(HALF_FLOAT, HalfFloat) + PRIMITIVE_CASE(FLOAT, Float) + PRIMITIVE_CASE(DOUBLE, Double) + PRIMITIVE_CASE(STRING, String) + PRIMITIVE_CASE(BINARY, Binary) + PRIMITIVE_CASE(FIXED_SIZE_BINARY, FixedSizeBinary) + PRIMITIVE_CASE(DATE32, Date32) + PRIMITIVE_CASE(DATE64, Date64) + PRIMITIVE_CASE(TIME32, Time32) + PRIMITIVE_CASE(TIME64, Time64) + PRIMITIVE_CASE(TIMESTAMP, Timestamp) + PRIMITIVE_CASE(DURATION, Duration) + PRIMITIVE_CASE(DECIMAL128, Decimal128) + PRIMITIVE_CASE(DECIMAL256, Decimal256) + case ::arrow::Type::DICTIONARY: + return GetBoundaries( + def_levels, rep_levels, num_levels, + *checked_cast(values).indices()); + case ::arrow::Type::NA: + FakeNullArray fake_null_array; + return Calculate(def_levels, rep_levels, num_levels, fake_null_array); + default: + return ::arrow::Status::NotImplemented("Unsupported type " + + values.type()->ToString()); + } +} + +} // namespace internal +} // namespace parquet diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 6a9285c5b58..ba96abc0ad6 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -28,541 +28,6 @@ using arrow::internal::checked_cast; namespace parquet { namespace internal { -constexpr uint64_t GEAR_HASH_TABLE[8][256] = { - {// seed = 0 - 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, - 
0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, - 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, - 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, - 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, - 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, - 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, - 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, - 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, - 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, - 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, - 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, - 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, - 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, - 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, - 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, - 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, - 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, - 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, - 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, - 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, - 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, - 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, - 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, - 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, - 0xc604f6e97087124d, 
0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, - 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, - 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, - 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, - 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, - 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, - 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, - 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, - 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, - 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, - 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, - 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, - 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, - 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, - 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, - 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, - 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, - 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, - 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, - 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, - 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, - 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, - 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, - 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, - 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 
0x4a819b72f610863a, 0xddecfad79d3f08be, - 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, - 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, - 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, - 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, - 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, - 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, - 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, - 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, - 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, - 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, - 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, - 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, - 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, - 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, - {// seed = 1 - 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, - 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, - 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, - 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, - 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, - 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, - 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, - 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, - 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, - 0xbcb38c2d8736a04f, 0x4025317e864bef15, 
0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, - 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, - 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, - 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, - 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, - 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, - 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, - 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, - 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, - 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, - 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, - 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, - 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, - 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, - 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, - 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, - 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, - 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, - 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, - 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, - 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, - 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, - 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, - 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, - 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 
0x0646a130459c3999, - 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, - 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, - 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, - 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, - 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, - 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, - 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, - 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, - 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, - 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, - 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, - 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, - 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, - 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, - 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, - 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, - 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, - 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, - 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, - 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, - 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, - 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, - 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, - 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, - 
0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, - 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, - 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, - 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, - 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, - 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, - {// seed = 2 - 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, - 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, - 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, - 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, - 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, - 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, - 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, - 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, - 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, - 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, - 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, - 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, - 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, - 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, - 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, - 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, - 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, - 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, - 
0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, - 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, - 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, - 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, - 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, - 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, - 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, - 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, - 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, - 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, - 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, - 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, - 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, - 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, - 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, - 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, - 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, - 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, - 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, - 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, - 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, - 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, - 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, - 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, - 0x1b2ea42a24b6b05e, 
0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, - 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, - 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, - 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, - 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, - 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, - 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, - 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, - 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, - 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, - 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, - 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, - 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, - 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, - 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, - 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, - 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, - 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, - 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, - 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, - 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, - 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, - {// seed = 3 - 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, - 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, - 0xd8e0a6e4c0b850d3, 
0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, - 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, - 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, - 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, - 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, - 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, - 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, - 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, - 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, - 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, - 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, - 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, - 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, - 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, - 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, - 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, - 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, - 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, - 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, - 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, - 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, - 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, - 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, - 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, - 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 
0xed52f0311fa0f7ad, 0xa12b16233f302eea, - 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, - 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, - 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, - 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, - 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, - 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, - 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, - 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, - 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, - 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, - 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, - 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, - 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, - 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, - 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, - 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, - 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, - 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, - 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, - 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, - 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, - 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, - 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, - 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 
0x540ac7ee61f2259f, - 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, - 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, - 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, - 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, - 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, - 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, - 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, - 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, - 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, - 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, - 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, - 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, - 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, - {// seed = 4 - 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, - 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, - 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, - 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, - 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, - 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, - 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, - 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, - 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, - 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, - 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 
0xeea8594a4a054856, - 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, - 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, - 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, - 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, - 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, - 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, - 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, - 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, - 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, - 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, - 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, - 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, - 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, - 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, - 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, - 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, - 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, - 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, - 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, - 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, - 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, - 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, - 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, - 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, - 
0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, - 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, - 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, - 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, - 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, - 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, - 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, - 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, - 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, - 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, - 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, - 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, - 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, - 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, - 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, - 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, - 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, - 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, - 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, - 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, - 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, - 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, - 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, - 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, - 0x4f4567c6a74cf0b9, 
0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, - 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, - 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, - 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, - 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, - {// seed = 5 - 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, - 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, - 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, - 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, - 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, - 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, - 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, - 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, - 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, - 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, - 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, - 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, - 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, - 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, - 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, - 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, - 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, - 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, - 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, - 0xcf586fc4e8e6c7db, 
0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, - 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, - 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, - 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, - 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, - 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, - 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, - 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, - 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, - 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, - 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, - 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, - 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, - 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, - 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, - 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, - 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, - 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, - 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, - 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, - 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, - 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, - 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, - 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, - 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 
0x9b3530bee67017d8, 0xd2faf08b291fdcb9, - 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, - 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, - 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, - 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, - 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, - 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, - 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, - 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, - 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, - 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, - 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, - 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, - 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, - 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, - 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, - 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, - 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, - 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, - 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, - 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, - {// seed = 6 - 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, - 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, - 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, - 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 
0x7844fc82b6fa9091, 0x72d095449863b8ec, - 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, - 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, - 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, - 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, - 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, - 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, - 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, - 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, - 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, - 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, - 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, - 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, - 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, - 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, - 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, - 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, - 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, - 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, - 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, - 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, - 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, - 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, - 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, - 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 
0x71fc85fa1e277d8f, - 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, - 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, - 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, - 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, - 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, - 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, - 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, - 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, - 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, - 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, - 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, - 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, - 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, - 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, - 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, - 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, - 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, - 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, - 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, - 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, - 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, - 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, - 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, - 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, - 
0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, - 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, - 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, - 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, - 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, - 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, - 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, - 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, - 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, - 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, - 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, - 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, - {// seed = 7 - 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, - 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, - 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, - 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, - 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, - 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, - 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, - 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, - 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, - 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, - 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, - 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, - 
0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, - 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, - 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, - 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, - 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, - 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, - 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, - 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, - 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, - 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, - 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, - 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, - 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, - 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, - 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, - 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, - 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, - 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, - 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, - 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, - 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, - 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, - 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, - 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, - 0x7f3b920fd20f4cff, 
0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, - 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, - 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, - 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, - 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, - 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, - 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, - 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, - 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, - 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, - 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, - 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, - 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, - 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, - 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, - 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, - 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, - 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, - 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, - 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, - 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, - 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, - 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, - 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, - 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 
0x1c6f925c05f6efd7, 0x8ae5097553856ea0, - 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, - 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, - 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, -}; - -const uint64_t AVG_LEN = 1024 * 1024; - -// create a fake null array class with a GetView method returning 0 always -class FakeNullArray { - public: - uint8_t GetView(int64_t i) const { return 0; } - - std::shared_ptr<::arrow::DataType> type() const { return ::arrow::null(); } - - int64_t null_count() const { return 0; } -}; - struct Chunk { int64_t level_offset; int64_t value_offset; @@ -574,207 +39,28 @@ struct Chunk { levels_to_write(levels_to_write) {} }; -static uint64_t GetMask(uint64_t min_size, uint64_t max_size) { - uint64_t avg_size = (min_size + max_size) / 2; - size_t mask_bits = static_cast(std::ceil(std::log2(avg_size))); - size_t effective_bits = mask_bits - 3 - 5; - return (1ULL << effective_bits) - 1; -} +// have a chunker here // rename it since it is not FastCDC anymore class FastCDC { public: - FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5) - : level_info_(level_info), - avg_len_(avg_len == 0 ? 
AVG_LEN : avg_len), - min_len_(static_cast(avg_len_ * 0.5)), - max_len_(static_cast(avg_len_ * 2.0)), - hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} - - template - bool Roll(const T value) { - constexpr size_t BYTE_WIDTH = sizeof(T); - chunk_size_ += BYTE_WIDTH; - if (chunk_size_ < min_len_) { - return false; - } - auto bytes = reinterpret_cast(&value); - bool match = false; - for (size_t i = 0; i < BYTE_WIDTH; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; - match |= (rolling_hash_ & hash_mask_) == 0; - } - return match; - } - - bool Roll(std::string_view value) { - chunk_size_ += value.size(); - if (chunk_size_ < min_len_) { - return false; - } - bool match = false; - for (char c : value) { - rolling_hash_ = - (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; - match |= (rolling_hash_ & hash_mask_) == 0; - } - return match; - } - - inline bool Check(bool match) { - if (ARROW_PREDICT_FALSE(match && (++nth_run_ >= 7))) { - nth_run_ = 0; - chunk_size_ = 0; - return true; - } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_len_)) { - chunk_size_ = 0; - return true; - } else { - return false; - } - } - - template - const std::vector GetBoundaries(const int16_t* def_levels, - const int16_t* rep_levels, int64_t num_levels, - const T& leaf_array) { - std::vector result; - bool has_def_levels = level_info_.def_level > 0; - bool has_rep_levels = level_info_.rep_level > 0; - - if (!has_rep_levels && !has_def_levels) { - // fastest path for non-repeated non-null data - bool val_match; - int64_t offset = 0; - int64_t prev_offset = 0; - while (offset < num_levels) { - val_match = Roll(leaf_array.GetView(offset)); - ++offset; - if (Check(val_match)) { - result.emplace_back(prev_offset, prev_offset, offset - prev_offset); - prev_offset = offset; - } - } - if (prev_offset < num_levels) { - result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); - } - } else if (!has_rep_levels) { - // non-repeated 
data possibly with nulls - bool def_match, val_match; - int64_t offset = 0; - int64_t prev_offset = 0; - while (offset < num_levels) { - def_match = Roll(def_levels[offset]); - val_match = Roll(leaf_array.GetView(offset)); - ++offset; - if (Check(def_match || val_match)) { - result.emplace_back(prev_offset, prev_offset, offset - prev_offset); - prev_offset = offset; - } - } - if (prev_offset < num_levels) { - result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); - } - } else { - // repeated data possibly with nulls - bool def_match, rep_match, val_match; - int16_t def_level; - int16_t rep_level; - int64_t level_offset = 0; - int64_t value_offset = 0; - int64_t record_level_offset = 0; - int64_t record_value_offset = 0; - int64_t prev_record_level_offset = 0; - int64_t prev_record_value_offset = 0; - - while (level_offset < num_levels) { - def_level = def_levels[level_offset]; - rep_level = rep_levels[level_offset]; - if (rep_level == 0) { - record_level_offset = level_offset; - record_value_offset = value_offset; - } - ++level_offset; - - def_match = Roll(def_level); - rep_match = Roll(rep_level); - if (ARROW_PREDICT_TRUE(def_level >= level_info_.repeated_ancestor_def_level)) { - val_match = Roll(leaf_array.GetView(value_offset)); - ++value_offset; - } else { - val_match = false; - } - - if (Check(def_match || rep_match || val_match)) { - auto levels_to_write = record_level_offset - prev_record_level_offset; - if (levels_to_write > 0) { - result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); - prev_record_level_offset = record_level_offset; - prev_record_value_offset = record_value_offset; - } - } - } - - auto levels_to_write = num_levels - prev_record_level_offset; - if (levels_to_write > 0) { - result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); - } - return result; - } - - return result; - } - -#define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ - case 
::arrow::Type::TYPE_ID: \ - return GetBoundaries(def_levels, rep_levels, num_levels, \ - checked_cast(values)); + FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5); const ::arrow::Result> GetBoundaries(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& values) { - auto type_id = values.type()->id(); - switch (type_id) { - PRIMITIVE_CASE(BOOL, Boolean) - PRIMITIVE_CASE(INT8, Int8) - PRIMITIVE_CASE(INT16, Int16) - PRIMITIVE_CASE(INT32, Int32) - PRIMITIVE_CASE(INT64, Int64) - PRIMITIVE_CASE(UINT8, UInt8) - PRIMITIVE_CASE(UINT16, UInt16) - PRIMITIVE_CASE(UINT32, UInt32) - PRIMITIVE_CASE(UINT64, UInt64) - PRIMITIVE_CASE(HALF_FLOAT, HalfFloat) - PRIMITIVE_CASE(FLOAT, Float) - PRIMITIVE_CASE(DOUBLE, Double) - PRIMITIVE_CASE(STRING, String) - PRIMITIVE_CASE(BINARY, Binary) - PRIMITIVE_CASE(FIXED_SIZE_BINARY, FixedSizeBinary) - PRIMITIVE_CASE(DATE32, Date32) - PRIMITIVE_CASE(DATE64, Date64) - PRIMITIVE_CASE(TIME32, Time32) - PRIMITIVE_CASE(TIME64, Time64) - PRIMITIVE_CASE(TIMESTAMP, Timestamp) - PRIMITIVE_CASE(DURATION, Duration) - PRIMITIVE_CASE(DECIMAL128, Decimal128) - PRIMITIVE_CASE(DECIMAL256, Decimal256) - case ::arrow::Type::DICTIONARY: - return GetBoundaries( - def_levels, rep_levels, num_levels, - *checked_cast(values).indices()); - case ::arrow::Type::NA: - FakeNullArray fake_null_array; - return GetBoundaries(def_levels, rep_levels, num_levels, fake_null_array); - default: - return ::arrow::Status::NotImplemented("Unsupported type " + - values.type()->ToString()); - } - } + const ::arrow::Array& values); private: + template + bool Roll(const T value); + bool Roll(std::string_view value); + inline bool Check(bool match); + + template + const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const T& leaf_array); + const internal::LevelInfo& level_info_; const uint64_t avg_len_; const uint64_t min_len_; From 
ee6a715cfa7300fcbe907ab30fbd6c5947f0fce6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Feb 2025 18:56:23 +0100 Subject: [PATCH 013/102] change the api to define min_chunk_size and max_chunk_size and automatically center the mask --- cpp/src/parquet/column_chunker.cc | 53 ++-- cpp/src/parquet/column_chunker.h | 15 +- cpp/src/parquet/column_chunker_test.cc | 415 ++++++++++++++++++++++++- cpp/src/parquet/column_writer.cc | 21 +- cpp/src/parquet/properties.h | 23 +- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 5 +- 7 files changed, 468 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index a9564a79e7c..d99d36b8483 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -22,8 +22,6 @@ #include "arrow/array.h" #include "parquet/level_conversion.h" -using arrow::internal::checked_cast; - namespace parquet { namespace internal { @@ -550,8 +548,6 @@ constexpr uint64_t GEAR_HASH_TABLE[8][256] = { 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, }; -const uint64_t AVG_LEN = 1024 * 1024; - // create a fake null array class with a GetView method returning 0 always class FakeNullArray { public: @@ -564,25 +560,27 @@ class FakeNullArray { static uint64_t GetMask(uint64_t min_size, uint64_t max_size) { uint64_t avg_size = (min_size + max_size) / 2; - size_t mask_bits = static_cast(std::ceil(std::log2(avg_size))); - size_t effective_bits = mask_bits - 3 - 5; - return (1ULL << effective_bits) - 1; + uint64_t target_size = avg_size - min_size; + size_t mask_bits = static_cast(std::floor(std::log2(target_size))); + // -3 because we are using 8 hash tables to have more gaussian-like distribution + // -1 narrows the chunk size distribution in order to avoid having too many hard + // cuts at the minimum and maximum chunk sizes + size_t effective_bits = mask_bits - 3 - 1; + return std::numeric_limits::max() << (64 - 
effective_bits); } -// rename it since it is not FastCDC anymore - -FastCDC::FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level) +ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, + uint64_t min_size, uint64_t max_size) : level_info_(level_info), - avg_len_(avg_len == 0 ? AVG_LEN : avg_len), - min_len_(static_cast(avg_len_ * 0.5)), - max_len_(static_cast(avg_len_ * 2.0)), - hash_mask_(GetMask(avg_len_, granurality_level + 3)) {} + min_size_(min_size), + max_size_(max_size), + hash_mask_(GetMask(min_size, max_size)) {} template -bool FastCDC::Roll(const T value) { +bool ContentDefinedChunker::Roll(const T value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; - if (chunk_size_ < min_len_) { + if (chunk_size_ < min_size_) { return false; } auto bytes = reinterpret_cast(&value); @@ -594,9 +592,9 @@ bool FastCDC::Roll(const T value) { return match; } -bool FastCDC::Roll(std::string_view value) { +bool ContentDefinedChunker::Roll(std::string_view value) { chunk_size_ += value.size(); - if (chunk_size_ < min_len_) { + if (chunk_size_ < min_size_) { return false; } bool match = false; @@ -608,12 +606,12 @@ bool FastCDC::Roll(std::string_view value) { return match; } -bool FastCDC::Check(bool match) { - if (ARROW_PREDICT_FALSE(match && (++nth_run_ >= 7))) { +bool ContentDefinedChunker::Check(bool match) { + if (ARROW_PREDICT_FALSE(match && ++nth_run_ >= 7)) { nth_run_ = 0; chunk_size_ = 0; return true; - } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_len_)) { + } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { chunk_size_ = 0; return true; } else { @@ -622,9 +620,10 @@ bool FastCDC::Check(bool match) { } template -const std::vector FastCDC::Calculate(const int16_t* def_levels, - const int16_t* rep_levels, int64_t num_levels, - const T& leaf_array) { +const std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const 
T& leaf_array) { std::vector result; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; @@ -717,9 +716,9 @@ const std::vector FastCDC::Calculate(const int16_t* def_levels, #define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ case ::arrow::Type::TYPE_ID: \ return Calculate(def_levels, rep_levels, num_levels, \ - checked_cast(values)); + static_cast(values)); -const ::arrow::Result> FastCDC::GetBoundaries( +const ::arrow::Result> ContentDefinedChunker::GetBoundaries( const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { auto type_id = values.type()->id(); @@ -750,7 +749,7 @@ const ::arrow::Result> FastCDC::GetBoundaries( case ::arrow::Type::DICTIONARY: return GetBoundaries( def_levels, rep_levels, num_levels, - *checked_cast(values).indices()); + *static_cast(values).indices()); case ::arrow::Type::NA: FakeNullArray fake_null_array; return Calculate(def_levels, rep_levels, num_levels, fake_null_array); diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index ba96abc0ad6..25ed78cb288 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -39,12 +39,10 @@ struct Chunk { levels_to_write(levels_to_write) {} }; -// have a chunker here - -// rename it since it is not FastCDC anymore -class FastCDC { +class ContentDefinedChunker { public: - FastCDC(const LevelInfo& level_info, uint64_t avg_len, uint8_t granurality_level = 5); + ContentDefinedChunker(const LevelInfo& level_info, uint64_t min_size, + uint64_t max_size); const ::arrow::Result> GetBoundaries(const int16_t* def_levels, const int16_t* rep_levels, @@ -62,12 +60,11 @@ class FastCDC { int64_t num_levels, const T& leaf_array); const internal::LevelInfo& level_info_; - const uint64_t avg_len_; - const uint64_t min_len_; - const uint64_t max_len_; + const uint64_t min_size_; + const uint64_t max_size_; const uint64_t hash_mask_; - uint8_t nth_run_ = 0; + uint64_t 
nth_run_ = 0; uint64_t chunk_size_ = 0; uint64_t rolling_hash_ = 0; }; diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index b248758bc12..c4332c882d4 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -7,10 +7,411 @@ // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. + +#include + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" +#include "arrow/util/float16.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/reader_internal.h" +#include "parquet/arrow/schema.h" +#include "parquet/arrow/test_util.h" +#include "parquet/arrow/writer.h" +#include "parquet/column_writer.h" +#include "parquet/file_writer.h" +#include "parquet/page_index.h" +#include "parquet/test_util.h" + +namespace parquet { + +using ::arrow::Array; +using ::arrow::ChunkedArray; +using ::arrow::ConcatenateTables; +using ::arrow::default_memory_pool; +using ::arrow::Field; +using ::arrow::Result; +using ::arrow::Table; +using ::arrow::io::BufferReader; +using ::arrow::random::GenerateArray; +using ::arrow::random::GenerateBatch; +using ::parquet::arrow::FileReader; +using ::parquet::arrow::FileReaderBuilder; +using ::parquet::arrow::MakeSimpleTable; +using ::parquet::arrow::NonNullArray; +using 
::parquet::arrow::WriteTable; + +using ::testing::Bool; +using ::testing::Combine; +using ::testing::Values; + +std::shared_ptr GenerateTable(const std::vector>& fields, + int64_t size, int32_t seed = 42) { + auto batch = GenerateBatch(fields, size, seed); + return Table::FromRecordBatches({batch}).ValueOrDie(); +} + +std::shared_ptr
ConcatAndCombine( + const std::vector>& parts) { + auto table = ConcatenateTables(parts).ValueOrDie(); + return table->CombineChunks().ValueOrDie(); +} + +Result> WriteTableToBuffer(const std::shared_ptr
& table, + uint64_t min_chunk_size, + uint64_t max_chunk_size, + int64_t row_group_size = 1024 * 1024) { + auto sink = CreateOutputStream(); + + auto write_props = WriterProperties::Builder() + .disable_dictionary() + ->enable_cdc() + ->cdc_size_range(min_chunk_size, max_chunk_size) + ->build(); + auto arrow_props = default_arrow_writer_properties(); + RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, + write_props, arrow_props)); + return sink->Finish(); +} + +Result> ReadTableFromBuffer(const std::shared_ptr& data) { + std::shared_ptr
result; + FileReaderBuilder builder; + std::unique_ptr reader; + RETURN_NOT_OK(builder.Open(std::make_shared(data))); + RETURN_NOT_OK(builder.memory_pool(::arrow::default_memory_pool()) + ->properties(default_arrow_reader_properties()) + ->Build(&reader)); + RETURN_NOT_OK(reader->ReadTable(&result)); + return result; +} + +std::vector GetColumnPageLengths(const std::shared_ptr& data, + int column_index = 0) { + std::vector page_lengths; + + auto buffer_reader = std::make_shared(data); + auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); + + auto metadata = parquet_reader->metadata(); + for (int rg = 0; rg < metadata->num_row_groups(); rg++) { + auto page_reader = parquet_reader->RowGroup(rg)->GetColumnPageReader(column_index); + while (auto page = page_reader->NextPage()) { + if (page->type() == PageType::DATA_PAGE || page->type() == PageType::DATA_PAGE_V2) { + auto data_page = static_cast(page.get()); + page_lengths.push_back(data_page->num_values()); + } + } + } + + return page_lengths; +} + +Result> WriteAndGetPageLengths(const std::shared_ptr
& table, + uint64_t min_chunk_size, + uint64_t max_chunk_size, + int column_index = 0) { + ARROW_ASSIGN_OR_RAISE(auto buffer, + WriteTableToBuffer(table, min_chunk_size, max_chunk_size)); + ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); + + RETURN_NOT_OK(readback->ValidateFull()); + ARROW_RETURN_IF(!readback->Equals(*table), + Status::Invalid("Readback table not equal to original")); + return GetColumnPageLengths(buffer, column_index); +} + +void AssertAllBetween(const std::vector& values, uint64_t min, uint64_t max) { + // expect the last chunk since it is not guaranteed to be within the range + for (size_t i = 0; i < values.size() - 1; i++) { + ASSERT_GE(values[i], min); + ASSERT_LE(values[i], max); + } + ASSERT_LE(values.back(), max); +} + +void AssertUpdateCase(const std::vector& original, + const std::vector& modified) { + ASSERT_EQ(original.size(), modified.size()); + for (size_t i = 0; i < original.size(); i++) { + ASSERT_EQ(original[i], modified[i]); + } +} + +void AssertDeleteCase(const std::vector& original, + const std::vector& modified, + uint8_t n_modifications = 1) { + ASSERT_EQ(original.size(), modified.size()); + size_t smaller_count = 0; + for (size_t i = 0; i < original.size(); i++) { + if (modified[i] < original[i]) { + smaller_count++; + ASSERT_LT(modified[i], original[i]); + } else { + ASSERT_EQ(modified[i], original[i]); + } + } + ASSERT_EQ(smaller_count, n_modifications); +} + +void AssertInsertCase(const std::vector& original, + const std::vector& modified, + uint8_t n_modifications = 1) { + ASSERT_EQ(original.size(), modified.size()); + size_t larger_count = 0; + for (size_t i = 0; i < original.size(); i++) { + if (modified[i] > original[i]) { + larger_count++; + ASSERT_GT(modified[i], original[i]); + } else { + ASSERT_EQ(modified[i], original[i]); + } + } + ASSERT_EQ(larger_count, n_modifications); +} + +void AssertAppendCase(const std::vector& original, + const std::vector& modified) { + ASSERT_GE(modified.size(), 
original.size()); + for (size_t i = 0; i < original.size() - 1; i++) { + ASSERT_EQ(original[i], modified[i]); + } + ASSERT_GT(modified[original.size() - 1], original.back()); +} + +uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { + if (nullable) { + byte_width += 2; + } + return size / byte_width; +} + +constexpr uint64_t kMinChunkSize = 128 * 1024; +constexpr uint64_t kMaxChunkSize = 256 * 1024; + +// TODO: +// - test nullable types +// - test nested types +// - test dictionary encoding +// - test multiple row groups + +class TestColumnChunker : public ::testing::TestWithParam< + std::tuple, bool>> {}; + +TEST_P(TestColumnChunker, DeleteOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part3}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertDeleteCase(base_lengths, modified_lengths, 1); +} + +TEST_P(TestColumnChunker, DeleteTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto 
part4 = GenerateTable({field}, 32, /*seed=*/2); + auto part5 = GenerateTable({field}, 128 * 1024); + + auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto modified = ConcatAndCombine({part1, part3, part5}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertDeleteCase(base_lengths, modified_lengths, 2); +} + +TEST_P(TestColumnChunker, UpdateOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto part4 = GenerateTable({field}, 32, /*seed=*/2); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part4, part3}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertUpdateCase(base_lengths, modified_lengths); +} + +TEST_P(TestColumnChunker, UpdateTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto 
field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto part4 = GenerateTable({field}, 32, /*seed=*/2); + auto part5 = GenerateTable({field}, 128 * 1024); + auto part6 = GenerateTable({field}, 32, /*seed=*/3); + auto part7 = GenerateTable({field}, 32, /*seed=*/4); + + auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertUpdateCase(base_lengths, modified_lengths); +} + +TEST_P(TestColumnChunker, InsertOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto part4 = GenerateTable({field}, 64); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part2, part4, part3}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, 
kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertInsertCase(base_lengths, modified_lengths, 1); +} + +TEST_P(TestColumnChunker, InsertTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto part4 = GenerateTable({field}, 32, /*seed=*/2); + auto part5 = GenerateTable({field}, 128 * 1024); + auto part6 = GenerateTable({field}, 64); + auto part7 = GenerateTable({field}, 64); + + auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto modified = ConcatAndCombine({part1, part2, part6, part3, part4, part7, part5}); + + auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertInsertCase(base_lengths, modified_lengths, 2); +} + +TEST_P(TestColumnChunker, Append) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, 128 * 1024); + auto part2 = GenerateTable({field}, 32, /*seed=*/1); + auto part3 = GenerateTable({field}, 128 * 1024); + auto part4 = GenerateTable({field}, 32 * 1024); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part2, part3, part4}); + + auto min_length = 
ElementCount(kMinChunkSize, dtype->byte_width(), nullable); + auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); + + ASSERT_OK_AND_ASSIGN(auto base_lengths, + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_lengths, + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + + AssertAllBetween(base_lengths, min_length, max_length); + AssertAllBetween(modified_lengths, min_length, max_length); + AssertAppendCase(base_lengths, modified_lengths); +} + +INSTANTIATE_TEST_SUITE_P( + TypeRoundtrip, TestColumnChunker, + Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), + ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), + ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), + ::arrow::float64()), + Bool())); + +} // namespace parquet + +// - check that the state is maintained across rowgroups, so the edits should be +// consistent +// - check that the edits are consistent between writes +// - some smoke testing like approach would be nice to test several arrow types diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 12c253806a8..c6e2716d4e4 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,7 +754,8 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_avg_size()) { + content_defined_chunker_(level_info_, properties->cdc_size_range().first, + properties->cdc_size_range().second) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -897,7 +898,7 @@ class ColumnWriterImpl { std::vector> data_pages_; - internal::FastCDC content_defined_chunker_; + internal::ContentDefinedChunker content_defined_chunker_; private: void InitSinks() { @@ -1366,15 +1367,17 @@ class TypedColumnWriterImpl : 
public ColumnWriterImpl, content_defined_chunker_.GetBoundaries( def_levels, rep_levels, num_levels, leaf_array)); for (auto chunk : boundaries) { - auto sliced_array = leaf_array.Slice(chunk.value_offset); + auto chunk_array = leaf_array.Slice(chunk.value_offset); + auto chunk_def_levels = AddIfNotNull(def_levels, chunk.level_offset); + auto chunk_rep_levels = AddIfNotNull(rep_levels, chunk.level_offset); if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) { - ARROW_CHECK_OK(WriteArrowDictionary( - def_levels + chunk.level_offset, rep_levels + chunk.level_offset, - chunk.levels_to_write, *sliced_array, ctx, maybe_parent_nulls)); + ARROW_CHECK_OK(WriteArrowDictionary(chunk_def_levels, chunk_rep_levels, + chunk.levels_to_write, *chunk_array, ctx, + maybe_parent_nulls)); } else { - ARROW_CHECK_OK(WriteArrowDense( - def_levels + chunk.level_offset, rep_levels + chunk.level_offset, - chunk.levels_to_write, *sliced_array, ctx, maybe_parent_nulls)); + ARROW_CHECK_OK(WriteArrowDense(chunk_def_levels, chunk_rep_levels, + chunk.levels_to_write, *chunk_array, ctx, + maybe_parent_nulls)); } if (num_buffered_values_ > 0) { AddDataPage(); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 86f590b68e8..ccb8e975d12 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -168,6 +168,8 @@ static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOM static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = SizeStatisticsLevel::PageAndColumnChunk; +static constexpr std::pair DEFAULT_CDC_SIZE_RANGE = + std::make_pair(256 * 1024, 1024 * 1024); class PARQUET_EXPORT ColumnProperties { public: @@ -263,7 +265,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), cdc_enabled_(false), - cdc_avg_size_(0) {} + cdc_size_range_(DEFAULT_CDC_SIZE_RANGE) {} explicit 
Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -280,7 +282,7 @@ class PARQUET_EXPORT WriterProperties { sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()), cdc_enabled_(properties.cdc_enabled()), - cdc_avg_size_(properties.cdc_avg_size()) {} + cdc_size_range_(properties.cdc_size_range()) {} virtual ~Builder() {} @@ -294,8 +296,8 @@ class PARQUET_EXPORT WriterProperties { return this; } - Builder* cdc_avg_size(uint64_t avg_size) { - cdc_avg_size_ = avg_size; + Builder* cdc_size_range(uint64_t min_size, uint64_t max_size) { + cdc_size_range_ = std::make_pair(min_size, max_size); return this; } @@ -722,7 +724,7 @@ class PARQUET_EXPORT WriterProperties { size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, - cdc_avg_size_)); + cdc_size_range_)); } private: @@ -753,7 +755,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map page_index_enabled_; bool cdc_enabled_; - uint64_t cdc_avg_size_; + std::pair cdc_size_range_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -779,7 +781,7 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } inline bool cdc_enabled() const { return cdc_enabled_; } - inline uint64_t cdc_avg_size() const { return cdc_avg_size_; } + inline std::pair cdc_size_range() const { return cdc_size_range_; } inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; @@ -883,7 +885,8 @@ class PARQUET_EXPORT WriterProperties { const ColumnProperties& default_column_properties, const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, - std::vector sorting_columns, bool cdc_enabled, uint64_t cdc_avg_size) + std::vector 
sorting_columns, bool cdc_enabled, + std::pair cdc_size_range) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -900,7 +903,7 @@ class PARQUET_EXPORT WriterProperties { default_column_properties_(default_column_properties), column_properties_(column_properties), cdc_enabled_(cdc_enabled), - cdc_avg_size_(cdc_avg_size) + cdc_size_range_(cdc_size_range) {} @@ -924,7 +927,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map column_properties_; bool cdc_enabled_; - uint64_t cdc_avg_size_; + std::pair cdc_size_range_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 51986eba842..a0dcfd6d453 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -497,7 +497,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_page_checksum() Builder* enable_cdc() Builder* disable_cdc() - Builder* cdc_avg_size(uint64_t avg_size) + Builder* cdc_size_range(uint64_t min_size, uint64_t max_size) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index ae53d6d868e..1ef86d3e3f9 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2114,15 +2114,14 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.dictionary_pagesize_limit(dictionary_pagesize_limit) # content defined chunking - if content_defined_chunking is False: props.disable_cdc() elif content_defined_chunking is True: props.enable_cdc() elif isinstance(content_defined_chunking, tuple): - avg_size, = content_defined_chunking + min_size, max_size = content_defined_chunking props.enable_cdc() - props.cdc_avg_size(avg_size) + props.cdc_size_range(min_size, max_size) else: raise ValueError( "Unsupported value for content_defined_chunking: {0}" From 
616e76de98222abdd8b9e1478adc590c7d393486 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 21 Feb 2025 17:08:35 +0100 Subject: [PATCH 014/102] additional testing (more types, dictionary encoding, nullable types) --- cpp/src/parquet/column_chunker_test.cc | 491 ++++++++++++++++++------- 1 file changed, 355 insertions(+), 136 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index c4332c882d4..a1682ec1027 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -7,8 +7,18 @@ // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #include +#include +#include +#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" @@ -68,14 +78,20 @@ std::shared_ptr
ConcatAndCombine( Result> WriteTableToBuffer(const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, + int64_t row_group_size = 1024 * 1024) { auto sink = CreateOutputStream(); - auto write_props = WriterProperties::Builder() - .disable_dictionary() - ->enable_cdc() - ->cdc_size_range(min_chunk_size, max_chunk_size) - ->build(); + auto builder = WriterProperties::Builder(); + // enable content defined chunking + builder.enable_cdc()->cdc_size_range(min_chunk_size, max_chunk_size); + if (enable_dictionary) { + builder.enable_dictionary(); + } else { + builder.disable_dictionary(); + } + auto write_props = builder.build(); auto arrow_props = default_arrow_writer_properties(); RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, write_props, arrow_props)); @@ -118,9 +134,12 @@ std::vector GetColumnPageLengths(const std::shared_ptr& data, Result> WriteAndGetPageLengths(const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + + bool enable_dictionary = false, int column_index = 0) { - ARROW_ASSIGN_OR_RAISE(auto buffer, - WriteTableToBuffer(table, min_chunk_size, max_chunk_size)); + ARROW_ASSIGN_OR_RAISE( + auto buffer, + WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); RETURN_NOT_OK(readback->ValidateFull()); @@ -129,53 +148,213 @@ Result> WriteAndGetPageLengths(const std::shared_ptr
& values, uint64_t min, uint64_t max) { +void AssertAllBetween(const std::vector& values, uint64_t min, uint64_t max, + bool expect_dictionary_fallback = false) { // expect the last chunk since it is not guaranteed to be within the range - for (size_t i = 0; i < values.size() - 1; i++) { - ASSERT_GE(values[i], min); - ASSERT_LE(values[i], max); + if (expect_dictionary_fallback) { + // if dictionary encoding is enabled, the writer can fallback to plain + // encoding splitting within a content defined chunk, so we can't + // guarantee that all chunks are within the range in this case, but we + // know that there can be at most 2 pages smaller than the min_chunk_size + size_t smaller_count = 0; + for (size_t i = 0; i < values.size() - 1; i++) { + if (values[i] < min) { + smaller_count++; + } else { + ASSERT_LE(values[i], max); + } + } + ASSERT_LE(smaller_count, 2); + } else { + for (size_t i = 0; i < values.size() - 1; i++) { + ASSERT_GE(values[i], min); + ASSERT_LE(values[i], max); + } } ASSERT_LE(values.back(), max); } +std::vector, std::vector>> FindDifferences( + const std::vector& first, const std::vector& second) { + auto n = first.size(), m = second.size(); + + // Build DP table for LCS. + std::vector> dp(n + 1, std::vector(m + 1, 0)); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < m; ++j) { + dp[i + 1][j + 1] = + (first[i] == second[j]) ? dp[i][j] + 1 : std::max(dp[i + 1][j], dp[i][j + 1]); + } + } + + // Backtrack to recover LCS indices. + std::vector> common; + for (auto i = n, j = m; i > 0 && j > 0;) { + if (first[i - 1] == second[j - 1]) { + common.emplace_back(i - 1, j - 1); + --i, --j; + } else if (dp[i - 1][j] >= dp[i][j - 1]) { + --i; + } else { + --j; + } + } + std::reverse(common.begin(), common.end()); + + // Extract differences using the common indices as anchors. 
+ std::vector, std::vector>> result; + size_t last_i = 0, last_j = 0; + for (auto [ci, cj] : common) { + std::vector diff1(first.begin() + last_i, first.begin() + ci); + std::vector diff2(second.begin() + last_j, second.begin() + cj); + if (!diff1.empty() || !diff2.empty()) { + result.emplace_back(std::move(diff1), std::move(diff2)); + } + last_i = ci + 1; + last_j = cj + 1; + } + // Add any remaining elements after the last common index. + std::vector diff1(first.begin() + last_i, first.end()); + std::vector diff2(second.begin() + last_j, second.end()); + if (!diff1.empty() || !diff2.empty()) { + result.emplace_back(std::move(diff1), std::move(diff2)); + } + + return result; +} + +TEST(TestFindDifferences, Basic) { + std::vector first = {1, 2, 3, 4, 5}; + std::vector second = {1, 7, 8, 4, 5}; + + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({7, 8})); +} + +TEST(TestFindDifferences, MultipleDifferences) { + std::vector first = {1, 2, 3, 4, 5, 6, 7}; + std::vector second = {1, 8, 9, 4, 10, 6, 11}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 3); + + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({8, 9})); + + ASSERT_EQ(diffs[1].first, std::vector({5})); + ASSERT_EQ(diffs[1].second, std::vector({10})); + + ASSERT_EQ(diffs[2].first, std::vector({7})); + ASSERT_EQ(diffs[2].second, std::vector({11})); +} + +TEST(TestFindDifferences, DifferentLengths) { + std::vector first = {1, 2, 3}; + std::vector second = {1, 2, 3, 4, 5}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_TRUE(diffs[0].first.empty()); + ASSERT_EQ(diffs[0].second, std::vector({4, 5})); +} + +TEST(TestFindDifferences, EmptyArrays) { + std::vector first = {}; + std::vector second = {}; + auto diffs = FindDifferences(first, second); + ASSERT_TRUE(diffs.empty()); +} + 
+TEST(TestFindDifferences, LongSequenceWithSingleDifference) { + std::vector first = { + 1994, 2193, 2700, 1913, 2052, + }; + std::vector second = {2048, 43, 2080, 2700, 1913, 2052}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({1994, 2193})); + ASSERT_EQ(diffs[0].second, std::vector({2048, 43, 2080})); + + // Verify that elements after the difference are identical + for (size_t i = 3; i < second.size(); i++) { + ASSERT_EQ(first[i - 1], second[i]); + } +} + +TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { + std::vector first = {2169, 1976, 2180, 2147, 1934, 1772, + 1914, 2075, 2154, 1940, 1934, 1970}; + std::vector second = {2169, 1976, 2180, 2147, 2265, 1804, + 1717, 1925, 2122, 1940, 1934, 1970}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({1934, 1772, 1914, 2075, 2154})); + ASSERT_EQ(diffs[0].second, std::vector({2265, 1804, 1717, 1925, 2122})); + + // Verify elements before and after the difference are identical + for (size_t i = 0; i < 4; i++) { + ASSERT_EQ(first[i], second[i]); + } + for (size_t i = 9; i < first.size(); i++) { + ASSERT_EQ(first[i], second[i]); + } +} + void AssertUpdateCase(const std::vector& original, - const std::vector& modified) { - ASSERT_EQ(original.size(), modified.size()); - for (size_t i = 0; i < original.size(); i++) { - ASSERT_EQ(original[i], modified[i]); + const std::vector& modified, uint8_t n_modifications) { + auto diffs = FindDifferences(original, modified); + ASSERT_LE(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); + } + + if (diffs.size() == 0) { + // no differences found, the arrays are equal + 
ASSERT_TRUE(original == modified); } } void AssertDeleteCase(const std::vector& original, - const std::vector& modified, - uint8_t n_modifications = 1) { - ASSERT_EQ(original.size(), modified.size()); - size_t smaller_count = 0; - for (size_t i = 0; i < original.size(); i++) { - if (modified[i] < original[i]) { - smaller_count++; - ASSERT_LT(modified[i], original[i]); - } else { - ASSERT_EQ(modified[i], original[i]); - } + const std::vector& modified, uint8_t n_modifications, + uint64_t edit_length) { + auto diffs = FindDifferences(original, modified); + ASSERT_EQ(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum + edit_length); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } - ASSERT_EQ(smaller_count, n_modifications); } void AssertInsertCase(const std::vector& original, - const std::vector& modified, - uint8_t n_modifications = 1) { - ASSERT_EQ(original.size(), modified.size()); - size_t larger_count = 0; - for (size_t i = 0; i < original.size(); i++) { - if (modified[i] > original[i]) { - larger_count++; - ASSERT_GT(modified[i], original[i]); - } else { - ASSERT_EQ(modified[i], original[i]); - } + const std::vector& modified, uint8_t n_modifications, + uint64_t edit_length) { + auto diffs = FindDifferences(original, modified); + ASSERT_EQ(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum + edit_length, right_sum); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } - ASSERT_EQ(larger_count, n_modifications); } void AssertAppendCase(const std::vector& original, @@ -189,229 +368,269 @@ void AssertAppendCase(const std::vector& original, 
uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { if (nullable) { + // in case of nullable types the def_levels are also fed through the chunker + // to identify changes in the null bitmap, this will increase the byte width + // and decrease the number of elements per chunk byte_width += 2; } return size / byte_width; } -constexpr uint64_t kMinChunkSize = 128 * 1024; -constexpr uint64_t kMaxChunkSize = 256 * 1024; +constexpr uint64_t kMinChunkSize = 32 * 1024; +constexpr uint64_t kMaxChunkSize = 128 * 1024; +constexpr uint64_t kPartLength = 128 * 1024; +constexpr uint64_t kEditLength = 32; // TODO: -// - test nullable types // - test nested types -// - test dictionary encoding // - test multiple row groups -class TestColumnChunker : public ::testing::TestWithParam< - std::tuple, bool>> {}; +class TestContentDefinedChunker + : public ::testing::TestWithParam< + std::tuple, bool, bool>> {}; -TEST_P(TestColumnChunker, DeleteOnce) { +TEST_P(TestContentDefinedChunker, DeleteOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, 
kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertDeleteCase(base_lengths, modified_lengths, 1); + AssertDeleteCase(base_lengths, modified_lengths, 1, kEditLength); } -TEST_P(TestColumnChunker, DeleteTwice) { +TEST_P(TestContentDefinedChunker, DeleteTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); auto modified = ConcatAndCombine({part1, part3, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - 
WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertDeleteCase(base_lengths, modified_lengths, 2); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertDeleteCase(base_lengths, modified_lengths, 2, kEditLength); } -TEST_P(TestColumnChunker, UpdateOnce) { +TEST_P(TestContentDefinedChunker, UpdateOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part4, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, 
kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertUpdateCase(base_lengths, modified_lengths); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertUpdateCase(base_lengths, modified_lengths, 1); } -TEST_P(TestColumnChunker, UpdateTwice) { +TEST_P(TestContentDefinedChunker, UpdateTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); - auto part6 = GenerateTable({field}, 32, /*seed=*/3); - auto part7 = GenerateTable({field}, 32, /*seed=*/4); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); + auto part6 = GenerateTable({field}, kEditLength, /*seed=*/6); + auto part7 = GenerateTable({field}, kEditLength, /*seed=*/7); auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); auto modified = 
ConcatAndCombine({part1, part6, part3, part7, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertUpdateCase(base_lengths, modified_lengths); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertUpdateCase(base_lengths, modified_lengths, 2); } -TEST_P(TestColumnChunker, InsertOnce) { +TEST_P(TestContentDefinedChunker, InsertOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 64); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part2, part4, part3}); + auto base = ConcatAndCombine({part1, part3}); + auto modified = 
ConcatAndCombine({part1, part2, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertInsertCase(base_lengths, modified_lengths, 1); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertInsertCase(base_lengths, modified_lengths, 1, kEditLength); } -TEST_P(TestColumnChunker, InsertTwice) { +TEST_P(TestContentDefinedChunker, InsertTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); + enable_dictionary = false; auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); - auto part6 = GenerateTable({field}, 64); - auto part7 = GenerateTable({field}, 64); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 
= GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part2, part6, part3, part4, part7, part5}); + auto base = ConcatAndCombine({part1, part3, part5}); + auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertInsertCase(base_lengths, modified_lengths, 2); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertInsertCase(base_lengths, modified_lengths, 2, kEditLength); } -TEST_P(TestColumnChunker, Append) { +TEST_P(TestContentDefinedChunker, Append) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32 * 1024); + auto part1 = 
GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part2, part3, part4}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); AssertAppendCase(base_lengths, modified_lengths); } INSTANTIATE_TEST_SUITE_P( - TypeRoundtrip, TestColumnChunker, + FixedSizedTypes, TestContentDefinedChunker, Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), ::arrow::float64()), - Bool())); + Bool(), Bool())); } // namespace parquet - -// - check that the state is maintained across rowgroups, so the edits should be -// consistent -// - check that the edits are consistent between writes -// - some smoke testing like approach would be nice to test several arrow 
types From 002a37d454f3ae5f0e2d61523adfb0c56268b625 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Feb 2025 00:19:06 +0100 Subject: [PATCH 015/102] test cases for binary-like types --- cpp/src/parquet/column_chunker.cc | 2 + cpp/src/parquet/column_chunker_test.cc | 366 +++++++++++++++++++------ 2 files changed, 282 insertions(+), 86 deletions(-) diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index d99d36b8483..bc5640cbd7d 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -736,7 +736,9 @@ const ::arrow::Result> ContentDefinedChunker::GetBoundaries( PRIMITIVE_CASE(FLOAT, Float) PRIMITIVE_CASE(DOUBLE, Double) PRIMITIVE_CASE(STRING, String) + PRIMITIVE_CASE(LARGE_STRING, LargeString) PRIMITIVE_CASE(BINARY, Binary) + PRIMITIVE_CASE(LARGE_BINARY, LargeBinary) PRIMITIVE_CASE(FIXED_SIZE_BINARY, FixedSizeBinary) PRIMITIVE_CASE(DATE32, Date32) PRIMITIVE_CASE(DATE64, Date64) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index a1682ec1027..3133888cc0f 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -92,7 +92,8 @@ Result> WriteTableToBuffer(const std::shared_ptr
& builder.disable_dictionary(); } auto write_props = builder.build(); - auto arrow_props = default_arrow_writer_properties(); + + auto arrow_props = ArrowWriterProperties::Builder().store_schema()->build(); RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, write_props, arrow_props)); return sink->Finish(); @@ -110,8 +111,9 @@ Result> ReadTableFromBuffer(const std::shared_ptr return result; } -std::vector GetColumnPageLengths(const std::shared_ptr& data, - int column_index = 0) { +std::pair, std::vector> GetColumnPageSizes( + const std::shared_ptr& data, int column_index = 0) { + std::vector page_sizes; std::vector page_lengths; auto buffer_reader = std::make_shared(data); @@ -123,20 +125,18 @@ std::vector GetColumnPageLengths(const std::shared_ptr& data, while (auto page = page_reader->NextPage()) { if (page->type() == PageType::DATA_PAGE || page->type() == PageType::DATA_PAGE_V2) { auto data_page = static_cast(page.get()); + page_sizes.push_back(data_page->size()); page_lengths.push_back(data_page->num_values()); } } } - return page_lengths; + return {page_lengths, page_sizes}; } -Result> WriteAndGetPageLengths(const std::shared_ptr
& table, - uint64_t min_chunk_size, - uint64_t max_chunk_size, - - bool enable_dictionary = false, - int column_index = 0) { +Result, std::vector>> WriteAndGetPageSizes( + const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, int column_index = 0) { ARROW_ASSIGN_OR_RAISE( auto buffer, WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); @@ -145,7 +145,7 @@ Result> WriteAndGetPageLengths(const std::shared_ptr
ValidateFull()); ARROW_RETURN_IF(!readback->Equals(*table), Status::Invalid("Readback table not equal to original")); - return GetColumnPageLengths(buffer, column_index); + return GetColumnPageSizes(buffer, column_index); } void AssertAllBetween(const std::vector& values, uint64_t min, uint64_t max, @@ -385,11 +385,11 @@ constexpr uint64_t kEditLength = 32; // - test nested types // - test multiple row groups -class TestContentDefinedChunker +class TestFixedSizedTypeCDC : public ::testing::TestWithParam< std::tuple, bool, bool>> {}; -TEST_P(TestContentDefinedChunker, DeleteOnce) { +TEST_P(TestFixedSizedTypeCDC, DeleteOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -406,23 +406,22 @@ TEST_P(TestContentDefinedChunker, DeleteOnce) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertAllBetween(base_lengths, min_length, max_length, + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertDeleteCase(base_lengths, modified_lengths, 1, 
kEditLength); + AssertDeleteCase(base_result.first, modified_result.first, 1, kEditLength); } -TEST_P(TestContentDefinedChunker, DeleteTwice) { +TEST_P(TestFixedSizedTypeCDC, DeleteTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -442,21 +441,21 @@ TEST_P(TestContentDefinedChunker, DeleteTwice) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertDeleteCase(base_lengths, modified_lengths, 2, kEditLength); + AssertDeleteCase(base_result.first, modified_result.first, 2, kEditLength); } -TEST_P(TestContentDefinedChunker, UpdateOnce) { +TEST_P(TestFixedSizedTypeCDC, UpdateOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -475,21 +474,21 @@ TEST_P(TestContentDefinedChunker, UpdateOnce) { auto min_length = ElementCount(kMinChunkSize, 
dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertUpdateCase(base_lengths, modified_lengths, 1); + AssertUpdateCase(base_result.first, modified_result.first, 1); } -TEST_P(TestContentDefinedChunker, UpdateTwice) { +TEST_P(TestFixedSizedTypeCDC, UpdateTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -511,21 +510,21 @@ TEST_P(TestContentDefinedChunker, UpdateTwice) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + 
WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertUpdateCase(base_lengths, modified_lengths, 2); + AssertUpdateCase(base_result.first, modified_result.first, 2); } -TEST_P(TestContentDefinedChunker, InsertOnce) { +TEST_P(TestFixedSizedTypeCDC, InsertOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -543,25 +542,24 @@ TEST_P(TestContentDefinedChunker, InsertOnce) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, 
max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertInsertCase(base_lengths, modified_lengths, 1, kEditLength); + AssertInsertCase(base_result.first, modified_result.first, 1, kEditLength); } -TEST_P(TestContentDefinedChunker, InsertTwice) { +TEST_P(TestFixedSizedTypeCDC, InsertTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); - enable_dictionary = false; auto field = ::arrow::field("f0", dtype, nullable); @@ -578,21 +576,21 @@ TEST_P(TestContentDefinedChunker, InsertTwice) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertInsertCase(base_lengths, modified_lengths, 2, kEditLength); + AssertInsertCase(base_result.first, modified_result.first, 2, kEditLength); } -TEST_P(TestContentDefinedChunker, Append) { +TEST_P(TestFixedSizedTypeCDC, Append) { auto 
dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -611,26 +609,222 @@ TEST_P(TestContentDefinedChunker, Append) { auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length, + AssertAllBetween(base_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_lengths, min_length, max_length, + AssertAllBetween(modified_result.first, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAppendCase(base_lengths, modified_lengths); + AssertAppendCase(base_result.first, modified_result.first); } INSTANTIATE_TEST_SUITE_P( - FixedSizedTypes, TestContentDefinedChunker, + FixedSizedTypes, TestFixedSizedTypeCDC, Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), ::arrow::float64()), Bool(), Bool())); +class TestVariableLengthTypeCDC + : public ::testing::TestWithParam< + std::tuple, bool>> {}; + +TEST_P(TestVariableLengthTypeCDC, Append) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + 
auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part2, part3, part4}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertAppendCase(base_result.first, modified_result.first); +} + +TEST_P(TestVariableLengthTypeCDC, UpdateOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part4, part3}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertUpdateCase(base_result.first, modified_result.first, 1); +} + 
+TEST_P(TestVariableLengthTypeCDC, UpdateTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); + auto part6 = GenerateTable({field}, kEditLength, /*seed=*/6); + auto part7 = GenerateTable({field}, kEditLength, /*seed=*/7); + + auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertUpdateCase(base_result.first, modified_result.first, 2); +} + +TEST_P(TestVariableLengthTypeCDC, InsertOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + + auto base = ConcatAndCombine({part1, part3}); + auto modified = ConcatAndCombine({part1, part2, part3}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, 
kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertInsertCase(base_result.first, modified_result.first, 1, kEditLength); +} + +TEST_P(TestVariableLengthTypeCDC, InsertTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); + + auto base = ConcatAndCombine({part1, part3, part5}); + auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertInsertCase(base_result.first, modified_result.first, 2, kEditLength); +} + +TEST_P(TestVariableLengthTypeCDC, DeleteOnce) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + + auto base = ConcatAndCombine({part1, part2, part3}); + auto modified = ConcatAndCombine({part1, part3}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto 
base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertDeleteCase(base_result.first, modified_result.first, 1, kEditLength); +} + +TEST_P(TestVariableLengthTypeCDC, DeleteTwice) { + auto dtype = std::get<0>(GetParam()); + auto nullable = std::get<1>(GetParam()); + + auto field = ::arrow::field("f0", dtype, nullable); + + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); + + auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto modified = ConcatAndCombine({part1, part3, part5}); + ASSERT_FALSE(base->Equals(*modified)); + + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); + if (!nullable) { + AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); + } + AssertDeleteCase(base_result.first, modified_result.first, 2, kEditLength); +} + +INSTANTIATE_TEST_SUITE_P(VarLenTypes, TestVariableLengthTypeCDC, + Combine(Values(::arrow::utf8(), ::arrow::large_utf8(), + ::arrow::binary(), ::arrow::large_binary(), + ::arrow::fixed_size_binary(16)), + Bool())); + } // namespace parquet From 1eb6f4cd46e541d0336ae38f643449b46f7f9f6c Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Feb 2025 00:37:30 +0100 Subject: [PATCH 016/102] reduce duplication in 
testing --- cpp/src/parquet/column_chunker_test.cc | 313 ++++--------------------- 1 file changed, 52 insertions(+), 261 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index 3133888cc0f..574df08c47f 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -111,8 +111,9 @@ Result> ReadTableFromBuffer(const std::shared_ptr return result; } -std::pair, std::vector> GetColumnPageSizes( - const std::shared_ptr& data, int column_index = 0) { +using PageSizes = std::pair, std::vector>; + +PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_index = 0) { std::vector page_sizes; std::vector page_lengths; @@ -134,9 +135,10 @@ std::pair, std::vector> GetColumnPageSizes( return {page_lengths, page_sizes}; } -Result, std::vector>> WriteAndGetPageSizes( - const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, - bool enable_dictionary = false, int column_index = 0) { +Result WriteAndGetPageSizes(const std::shared_ptr
& table, + uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, + int column_index = 0) { ARROW_ASSIGN_OR_RAISE( auto buffer, WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); @@ -376,6 +378,24 @@ uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { return size / byte_width; } +void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, + PageSizes base_result, PageSizes modified_result, bool nullable, + bool enable_dictionary, uint64_t min_chunk_size, + uint64_t max_chunk_size) { + if (::arrow::is_fixed_width(dtype->id())) { + auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); + auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); + AssertAllBetween(base_result.first, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_result.first, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + } else if (::arrow::is_base_binary_like(dtype->id()) && !nullable && + !enable_dictionary) { + AssertAllBetween(base_result.second, min_chunk_size, max_chunk_size); + AssertAllBetween(modified_result.second, min_chunk_size, max_chunk_size); + } +} + constexpr uint64_t kMinChunkSize = 32 * 1024; constexpr uint64_t kMaxChunkSize = 128 * 1024; constexpr uint64_t kPartLength = 128 * 1024; @@ -385,11 +405,10 @@ constexpr uint64_t kEditLength = 32; // - test nested types // - test multiple row groups -class TestFixedSizedTypeCDC - : public ::testing::TestWithParam< - std::tuple, bool, bool>> {}; +class TestColumnCDC : public ::testing::TestWithParam< + std::tuple, bool, bool>> {}; -TEST_P(TestFixedSizedTypeCDC, DeleteOnce) { +TEST_P(TestColumnCDC, DeleteOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -404,8 +423,6 @@ TEST_P(TestFixedSizedTypeCDC, DeleteOnce) { auto modified = 
ConcatAndCombine({part1, part3}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -413,15 +430,13 @@ TEST_P(TestFixedSizedTypeCDC, DeleteOnce) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertDeleteCase(base_result.first, modified_result.first, 1, kEditLength); } -TEST_P(TestFixedSizedTypeCDC, DeleteTwice) { +TEST_P(TestColumnCDC, DeleteTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -438,9 +453,6 @@ TEST_P(TestFixedSizedTypeCDC, DeleteTwice) { auto modified = ConcatAndCombine({part1, part3, part5}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -448,14 +460,12 @@ TEST_P(TestFixedSizedTypeCDC, DeleteTwice) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, 
- /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertDeleteCase(base_result.first, modified_result.first, 2, kEditLength); } -TEST_P(TestFixedSizedTypeCDC, UpdateOnce) { +TEST_P(TestColumnCDC, UpdateOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -471,9 +481,6 @@ TEST_P(TestFixedSizedTypeCDC, UpdateOnce) { auto modified = ConcatAndCombine({part1, part4, part3}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -481,14 +488,12 @@ TEST_P(TestFixedSizedTypeCDC, UpdateOnce) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertUpdateCase(base_result.first, modified_result.first, 1); } -TEST_P(TestFixedSizedTypeCDC, UpdateTwice) { +TEST_P(TestColumnCDC, UpdateTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -507,9 +512,6 @@ TEST_P(TestFixedSizedTypeCDC, UpdateTwice) { auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = 
ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -517,14 +519,12 @@ TEST_P(TestFixedSizedTypeCDC, UpdateTwice) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertUpdateCase(base_result.first, modified_result.first, 2); } -TEST_P(TestFixedSizedTypeCDC, InsertOnce) { +TEST_P(TestColumnCDC, InsertOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -539,9 +539,6 @@ TEST_P(TestFixedSizedTypeCDC, InsertOnce) { auto modified = ConcatAndCombine({part1, part2, part3}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -549,14 +546,12 @@ TEST_P(TestFixedSizedTypeCDC, InsertOnce) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); 
AssertInsertCase(base_result.first, modified_result.first, 1, kEditLength); } -TEST_P(TestFixedSizedTypeCDC, InsertTwice) { +TEST_P(TestColumnCDC, InsertTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -573,9 +568,6 @@ TEST_P(TestFixedSizedTypeCDC, InsertTwice) { auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); @@ -583,14 +575,12 @@ TEST_P(TestFixedSizedTypeCDC, InsertTwice) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertInsertCase(base_result.first, modified_result.first, 2, kEditLength); } -TEST_P(TestFixedSizedTypeCDC, Append) { +TEST_P(TestColumnCDC, Append) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); auto enable_dictionary = std::get<2>(GetParam()); @@ -606,9 +596,6 @@ TEST_P(TestFixedSizedTypeCDC, Append) { auto modified = ConcatAndCombine({part1, part2, part3, part4}); ASSERT_FALSE(base->Equals(*modified)); - auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); - auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); - ASSERT_OK_AND_ASSIGN(auto base_result, WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, 
/*enable_dictionary=*/enable_dictionary)); @@ -616,215 +603,19 @@ TEST_P(TestFixedSizedTypeCDC, Append) { WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); AssertAppendCase(base_result.first, modified_result.first); } INSTANTIATE_TEST_SUITE_P( - FixedSizedTypes, TestFixedSizedTypeCDC, + FixedSizedTypes, TestColumnCDC, Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), - ::arrow::float64()), + ::arrow::float64(), ::arrow::binary(), ::arrow::large_binary(), + ::arrow::fixed_size_binary(16), ::arrow::utf8(), + ::arrow::large_utf8()), Bool(), Bool())); -class TestVariableLengthTypeCDC - : public ::testing::TestWithParam< - std::tuple, bool>> {}; - -TEST_P(TestVariableLengthTypeCDC, Append) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part2, part3, part4}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, 
kMinChunkSize, kMaxChunkSize)); - - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertAppendCase(base_result.first, modified_result.first); -} - -TEST_P(TestVariableLengthTypeCDC, UpdateOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part4, part3}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertUpdateCase(base_result.first, modified_result.first, 1); -} - -TEST_P(TestVariableLengthTypeCDC, UpdateTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - auto part6 = GenerateTable({field}, kEditLength, /*seed=*/6); - auto part7 = GenerateTable({field}, kEditLength, /*seed=*/7); - - auto base = 
ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertUpdateCase(base_result.first, modified_result.first, 2); -} - -TEST_P(TestVariableLengthTypeCDC, InsertOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - - auto base = ConcatAndCombine({part1, part3}); - auto modified = ConcatAndCombine({part1, part2, part3}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertInsertCase(base_result.first, modified_result.first, 1, kEditLength); -} - -TEST_P(TestVariableLengthTypeCDC, InsertTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); 
- auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - - auto base = ConcatAndCombine({part1, part3, part5}); - auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertInsertCase(base_result.first, modified_result.first, 2, kEditLength); -} - -TEST_P(TestVariableLengthTypeCDC, DeleteOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part3}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertDeleteCase(base_result.first, modified_result.first, 1, kEditLength); -} - -TEST_P(TestVariableLengthTypeCDC, DeleteTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, 
kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - - auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part3, part5}); - ASSERT_FALSE(base->Equals(*modified)); - - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize)); - if (!nullable) { - AssertAllBetween(base_result.second, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(modified_result.second, kMinChunkSize, kMaxChunkSize); - } - AssertDeleteCase(base_result.first, modified_result.first, 2, kEditLength); -} - -INSTANTIATE_TEST_SUITE_P(VarLenTypes, TestVariableLengthTypeCDC, - Combine(Values(::arrow::utf8(), ::arrow::large_utf8(), - ::arrow::binary(), ::arrow::large_binary(), - ::arrow::fixed_size_binary(16)), - Bool())); - } // namespace parquet From c9b42b4999f1b15a8877ae2fdee2a2e676336d2e Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Feb 2025 00:55:00 +0100 Subject: [PATCH 017/102] reduce duplication in testing --- cpp/src/parquet/column_chunker_test.cc | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index 574df08c47f..bf4f6ed77e5 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -145,8 +145,10 @@ Result WriteAndGetPageSizes(const std::shared_ptr
& table, ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); RETURN_NOT_OK(readback->ValidateFull()); - ARROW_RETURN_IF(!readback->Equals(*table), - Status::Invalid("Readback table not equal to original")); + if (readback->schema()->Equals(*table->schema())) { + ARROW_RETURN_IF(!readback->Equals(*table), + Status::Invalid("Readback table not equal to original")); + } return GetColumnPageSizes(buffer, column_index); } @@ -338,8 +340,8 @@ void AssertDeleteCase(const std::vector& original, for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum, right_sum + edit_length); - ASSERT_LE(diff.first.size(), 2); - ASSERT_LE(diff.second.size(), 2); + ASSERT_LE(diff.first.size(), 3); + ASSERT_LE(diff.second.size(), 3); } } @@ -354,8 +356,8 @@ void AssertInsertCase(const std::vector& original, for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum + edit_length, right_sum); - ASSERT_LE(diff.first.size(), 2); - ASSERT_LE(diff.second.size(), 2); + ASSERT_LE(diff.first.size(), 3); + ASSERT_LE(diff.second.size(), 3); } } @@ -608,14 +610,19 @@ TEST_P(TestColumnCDC, Append) { AssertAppendCase(base_result.first, modified_result.first); } +// TODO(kszucs): add extension type and dictionary type INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), ::arrow::float64(), ::arrow::binary(), ::arrow::large_binary(), - ::arrow::fixed_size_binary(16), ::arrow::utf8(), - ::arrow::large_utf8()), + ::arrow::fixed_size_binary(16), ::arrow::utf8(), ::arrow::large_utf8(), + ::arrow::date32(), ::arrow::date64(), ::arrow::decimal128(18, 6), + ::arrow::decimal256(40, 6), ::arrow::time32(::arrow::TimeUnit::SECOND), + 
::arrow::time64(::arrow::TimeUnit::NANO), + ::arrow::timestamp(::arrow::TimeUnit::NANO), + ::arrow::duration(::arrow::TimeUnit::NANO)), Bool(), Bool())); } // namespace parquet From 50ce77c7b66d34535d6a543ad5b6832a5dfb9ffd Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 24 Feb 2025 19:19:26 +0100 Subject: [PATCH 018/102] refactoring + testing + introduce norm_factor parameter --- cpp/src/parquet/column_chunker.cc | 135 ++++--- cpp/src/parquet/column_chunker.h | 12 +- cpp/src/parquet/column_chunker_test.cc | 526 ++++++++++++++----------- cpp/src/parquet/column_writer.cc | 4 +- cpp/src/parquet/properties.h | 24 +- python/pyarrow/_parquet.pxd | 5 +- python/pyarrow/_parquet.pyx | 26 +- 7 files changed, 412 insertions(+), 320 deletions(-) diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index bc5640cbd7d..14de6d81575 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -20,6 +20,7 @@ #include #include #include "arrow/array.h" +#include "arrow/util/logging.h" #include "parquet/level_conversion.h" namespace parquet { @@ -558,65 +559,83 @@ class FakeNullArray { int64_t null_count() const { return 0; } }; -static uint64_t GetMask(uint64_t min_size, uint64_t max_size) { +static uint64_t GetMask(uint64_t min_size, uint64_t max_size, uint8_t norm_factor) { + // we aim for gaussian-like distribution of chunk sizes between min_size and max_size uint64_t avg_size = (min_size + max_size) / 2; + // we skip calculating gearhash for the first `min_size` bytes, so we are looking for + // a smaller chunk than the average size uint64_t target_size = avg_size - min_size; size_t mask_bits = static_cast(std::floor(std::log2(target_size))); // -3 because we are using 8 hash tables to have more gaussian-like distribution - // -1 narrows the chunk size distribution in order to avoid having too many hard - // cuts at the minimum and maximum chunk sizes - size_t effective_bits = mask_bits - 3 - 1; + // `norm_factor`
narrows the chunk size distribution aroun avg_size + size_t effective_bits = mask_bits - 3 - norm_factor; return std::numeric_limits::max() << (64 - effective_bits); } ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, - uint64_t min_size, uint64_t max_size) + std::pair size_range, + uint8_t norm_factor) : level_info_(level_info), - min_size_(min_size), - max_size_(max_size), - hash_mask_(GetMask(min_size, max_size)) {} + min_size_(size_range.first), + max_size_(size_range.second), + hash_mask_(GetMask(size_range.first, size_range.second, norm_factor)) {} template -bool ContentDefinedChunker::Roll(const T value) { +void ContentDefinedChunker::Roll(const T value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; if (chunk_size_ < min_size_) { - return false; + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; } auto bytes = reinterpret_cast(&value); - bool match = false; for (size_t i = 0; i < BYTE_WIDTH; ++i) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; - match |= (rolling_hash_ & hash_mask_) == 0; + if ((rolling_hash_ & hash_mask_) == 0) { + has_matched_ = true; + } } - return match; } -bool ContentDefinedChunker::Roll(std::string_view value) { +void ContentDefinedChunker::Roll(std::string_view value) { chunk_size_ += value.size(); if (chunk_size_ < min_size_) { - return false; + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; } - bool match = false; for (char c : value) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; - match |= (rolling_hash_ & hash_mask_) == 0; + if ((rolling_hash_ & hash_mask_) == 0) { + has_matched_ = true; + } } - return match; } -bool ContentDefinedChunker::Check(bool match) { - if (ARROW_PREDICT_FALSE(match && 
++nth_run_ >= 7)) { - nth_run_ = 0; - chunk_size_ = 0; - return true; - } else if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { +bool ContentDefinedChunker::Check() { + // decide whether to create a new chunk based on the rolling hash; has_matched_ is + // set to true if we encountered a match since the last Check() call + if (ARROW_PREDICT_FALSE(has_matched_)) { + has_matched_ = false; + // in order to have a normal distribution of chunk sizes, we only create a new chunk + // if the adjused mask matches the rolling hash 8 times in a row, each run uses a + // different gearhash table (gearhash's chunk size has exponential distribution, and + // we use central limit theorem to approximate normal distribution) + if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) { + nth_run_ = 0; + chunk_size_ = 0; + return true; + } + } + if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { + // we have a hard limit on the maximum chunk size, not that we don't reset the rolling + // hash state here, so the next Check() call will continue from the current state chunk_size_ = 0; return true; - } else { - return false; } + return false; } template @@ -629,14 +648,13 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev bool has_rep_levels = level_info_.rep_level > 0; if (!has_rep_levels && !has_def_levels) { - // fastest path for non-repeated non-null data - bool val_match; + // fastest path for non-nested non-null data int64_t offset = 0; int64_t prev_offset = 0; while (offset < num_levels) { - val_match = Roll(leaf_array.GetView(offset)); + Roll(leaf_array.GetView(offset)); ++offset; - if (Check(val_match)) { + if (Check()) { result.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; } @@ -645,15 +663,14 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); } } else if (!has_rep_levels) { - // non-repeated data possibly with nulls - bool 
def_match, val_match; + // non-nested data with nulls int64_t offset = 0; int64_t prev_offset = 0; while (offset < num_levels) { - def_match = Roll(def_levels[offset]); - val_match = Roll(leaf_array.GetView(offset)); + Roll(def_levels[offset]); + Roll(leaf_array.GetView(offset)); ++offset; - if (Check(def_match || val_match)) { + if (Check()) { result.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; } @@ -662,52 +679,48 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); } } else { - // repeated data possibly with nulls - bool def_match, rep_match, val_match; + // nested data with nulls + bool has_leaf_value; + bool is_record_boundary; int16_t def_level; int16_t rep_level; int64_t level_offset = 0; int64_t value_offset = 0; int64_t record_level_offset = 0; int64_t record_value_offset = 0; - int64_t prev_record_level_offset = 0; - int64_t prev_record_value_offset = 0; while (level_offset < num_levels) { def_level = def_levels[level_offset]; rep_level = rep_levels[level_offset]; - if (rep_level == 0) { - record_level_offset = level_offset; - record_value_offset = value_offset; - } - ++level_offset; - def_match = Roll(def_level); - rep_match = Roll(rep_level); - if (ARROW_PREDICT_TRUE(def_level >= level_info_.repeated_ancestor_def_level)) { - val_match = Roll(leaf_array.GetView(value_offset)); - ++value_offset; - } else { - val_match = false; + has_leaf_value = def_level >= level_info_.repeated_ancestor_def_level; + is_record_boundary = rep_level == 0; + + Roll(def_level); + Roll(rep_level); + if (has_leaf_value) { + Roll(leaf_array.GetView(value_offset)); } - if (Check(def_match || rep_match || val_match)) { - auto levels_to_write = record_level_offset - prev_record_level_offset; + if (is_record_boundary && Check()) { + auto levels_to_write = level_offset - record_level_offset; if (levels_to_write > 0) { - 
result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); - prev_record_level_offset = record_level_offset; - prev_record_value_offset = record_value_offset; + result.emplace_back(record_level_offset, record_value_offset, levels_to_write); + record_level_offset = level_offset; + record_value_offset = value_offset; } } + + ++level_offset; + if (has_leaf_value) { + ++value_offset; + } } - auto levels_to_write = num_levels - prev_record_level_offset; + auto levels_to_write = num_levels - record_level_offset; if (levels_to_write > 0) { - result.emplace_back(prev_record_level_offset, prev_record_value_offset, - levels_to_write); + result.emplace_back(record_level_offset, record_value_offset, levels_to_write); } - return result; } return result; diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 25ed78cb288..5011620bd31 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -41,8 +41,9 @@ struct Chunk { class ContentDefinedChunker { public: - ContentDefinedChunker(const LevelInfo& level_info, uint64_t min_size, - uint64_t max_size); + ContentDefinedChunker(const LevelInfo& level_info, + std::pair size_range, + uint8_t norm_factor = 1); const ::arrow::Result> GetBoundaries(const int16_t* def_levels, const int16_t* rep_levels, @@ -51,9 +52,9 @@ class ContentDefinedChunker { private: template - bool Roll(const T value); - bool Roll(std::string_view value); - inline bool Check(bool match); + void Roll(const T value); + void Roll(std::string_view value); + inline bool Check(); template const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, @@ -64,6 +65,7 @@ class ContentDefinedChunker { const uint64_t max_size_; const uint64_t hash_mask_; + bool has_matched_ = false; uint64_t nth_run_ = 0; uint64_t chunk_size_ = 0; uint64_t rolling_hash_ = 0; diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index 
bf4f6ed77e5..7bfdb4ed138 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -31,6 +31,7 @@ #include "arrow/type_traits.h" #include "arrow/util/decimal.h" #include "arrow/util/float16.h" +#include "arrow/util/logging.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/reader_internal.h" #include "parquet/arrow/schema.h" @@ -71,6 +72,8 @@ std::shared_ptr
GenerateTable(const std::vector>& std::shared_ptr
ConcatAndCombine( const std::vector>& parts) { + // Concatenate and combine chunks so the table doesn't carry information about + // the modification points auto table = ConcatenateTables(parts).ValueOrDie(); return table->CombineChunks().ValueOrDie(); } @@ -84,7 +87,6 @@ Result> WriteTableToBuffer(const std::shared_ptr
& auto sink = CreateOutputStream(); auto builder = WriterProperties::Builder(); - // enable content defined chunking builder.enable_cdc()->cdc_size_range(min_chunk_size, max_chunk_size); if (enable_dictionary) { builder.enable_dictionary(); @@ -92,7 +94,6 @@ Result> WriteTableToBuffer(const std::shared_ptr
& builder.disable_dictionary(); } auto write_props = builder.build(); - auto arrow_props = ArrowWriterProperties::Builder().store_schema()->build(); RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, write_props, arrow_props)); @@ -111,11 +112,16 @@ Result> ReadTableFromBuffer(const std::shared_ptr return result; } -using PageSizes = std::pair, std::vector>; +struct PageSizes { + std::vector lengths; + std::vector sizes; +}; PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_index = 0) { - std::vector page_sizes; - std::vector page_lengths; + // Read the parquet data out of the buffer and get the sizes and lengths of the + // data pages in given column. We assert on the sizes and lengths of the pages + // to ensure that the chunking is done correctly. + PageSizes result; auto buffer_reader = std::make_shared(data); auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); @@ -126,19 +132,20 @@ PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_ind while (auto page = page_reader->NextPage()) { if (page->type() == PageType::DATA_PAGE || page->type() == PageType::DATA_PAGE_V2) { auto data_page = static_cast(page.get()); - page_sizes.push_back(data_page->size()); - page_lengths.push_back(data_page->num_values()); + result.sizes.push_back(data_page->size()); + result.lengths.push_back(data_page->num_values()); } } } - return {page_lengths, page_sizes}; + return result; } Result WriteAndGetPageSizes(const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, bool enable_dictionary = false, int column_index = 0) { + // Write the table to a buffer and read it back to get the page sizes ARROW_ASSIGN_OR_RAISE( auto buffer, WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); @@ -180,51 +187,74 @@ void AssertAllBetween(const std::vector& values, uint64_t min, uint64_ std::vector, std::vector>> FindDifferences( const std::vector& first, const std::vector& second) { - auto n = first.size(), m = second.size(); - - // Build DP table for LCS. + // Compute LCS table. + size_t n = first.size(), m = second.size(); std::vector> dp(n + 1, std::vector(m + 1, 0)); - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < m; ++j) { - dp[i + 1][j + 1] = - (first[i] == second[j]) ? dp[i][j] + 1 : std::max(dp[i + 1][j], dp[i][j + 1]); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < m; j++) { + if (first[i] == second[j]) { + dp[i + 1][j + 1] = dp[i][j] + 1; + } else { + dp[i + 1][j + 1] = std::max(dp[i + 1][j], dp[i][j + 1]); + } } } - // Backtrack to recover LCS indices. + // Backtrack to get common indices. std::vector> common; - for (auto i = n, j = m; i > 0 && j > 0;) { + for (size_t i = n, j = m; i > 0 && j > 0;) { if (first[i - 1] == second[j - 1]) { common.emplace_back(i - 1, j - 1); - --i, --j; + i--, j--; } else if (dp[i - 1][j] >= dp[i][j - 1]) { - --i; + i--; } else { - --j; + j--; } } std::reverse(common.begin(), common.end()); - // Extract differences using the common indices as anchors. + // Build raw differences. 
std::vector, std::vector>> result; size_t last_i = 0, last_j = 0; - for (auto [ci, cj] : common) { - std::vector diff1(first.begin() + last_i, first.begin() + ci); - std::vector diff2(second.begin() + last_j, second.begin() + cj); - if (!diff1.empty() || !diff2.empty()) { - result.emplace_back(std::move(diff1), std::move(diff2)); + for (auto& c : common) { + auto ci = c.first; + auto cj = c.second; + if (ci > last_i || cj > last_j) { + result.push_back({{first.begin() + last_i, first.begin() + ci}, + {second.begin() + last_j, second.begin() + cj}}); } last_i = ci + 1; last_j = cj + 1; } - // Add any remaining elements after the last common index. - std::vector diff1(first.begin() + last_i, first.end()); - std::vector diff2(second.begin() + last_j, second.end()); - if (!diff1.empty() || !diff2.empty()) { - result.emplace_back(std::move(diff1), std::move(diff2)); + if (last_i < n || last_j < m) { + result.push_back( + {{first.begin() + last_i, first.end()}, {second.begin() + last_j, second.end()}}); } - return result; + // Merge adjacent diffs if one side is empty in the first diff and the other side + // is empty in the next diff, to avoid splitting single changes into two parts. 
+ std::vector, std::vector>> merged; + for (auto& diff : result) { + if (!merged.empty()) { + auto& prev = merged.back(); + bool can_merge_a = prev.first.empty() && !prev.second.empty() && + !diff.first.empty() && diff.second.empty(); + bool can_merge_b = prev.second.empty() && !prev.first.empty() && + !diff.second.empty() && diff.first.empty(); + if (can_merge_a) { + // Combine into one change + prev.first = std::move(diff.first); + continue; + } else if (can_merge_b) { + prev.second = std::move(diff.second); + continue; + } + } + merged.push_back(std::move(diff)); + } + + return merged; } TEST(TestFindDifferences, Basic) { @@ -309,16 +339,40 @@ TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { } } -void AssertUpdateCase(const std::vector& original, +TEST(TestFindDifferences, AdditionalCase) { + std::vector original = {445, 312, 393, 401, 410, 138, 558, 457}; + std::vector modified = {445, 312, 393, 393, 410, 138, 558, 457}; + + auto diffs = FindDifferences(original, modified); + ASSERT_EQ(diffs.size(), 1); + + ASSERT_EQ(diffs[0].first, std::vector({401})); + ASSERT_EQ(diffs[0].second, std::vector({393})); + + // Verify elements before and after the difference are identical + for (size_t i = 0; i < 3; i++) { + ASSERT_EQ(original[i], modified[i]); + } + for (size_t i = 4; i < original.size(); i++) { + ASSERT_EQ(original[i], modified[i]); + } +} + +void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, + const std::vector& original, const std::vector& modified, uint8_t n_modifications) { auto diffs = FindDifferences(original, modified); + // Print diffs, original, and modified sequences for debugging purposes + ASSERT_LE(diffs.size(), n_modifications); for (const auto& diff : diffs) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum, right_sum); + if (!::arrow::is_list_like(dtype->id())) { + uint64_t left_sum = 0, 
right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum); + } ASSERT_LE(diff.first.size(), 2); ASSERT_LE(diff.second.size(), 2); } @@ -329,35 +383,41 @@ void AssertUpdateCase(const std::vector& original, } } -void AssertDeleteCase(const std::vector& original, +void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, + const std::vector& original, const std::vector& modified, uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); ASSERT_EQ(diffs.size(), n_modifications); for (const auto& diff : diffs) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum, right_sum + edit_length); - ASSERT_LE(diff.first.size(), 3); - ASSERT_LE(diff.second.size(), 3); + if (!::arrow::is_list_like(dtype->id())) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum + edit_length); + } + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } } -void AssertInsertCase(const std::vector& original, +void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, + const std::vector& original, const std::vector& modified, uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); ASSERT_EQ(diffs.size(), n_modifications); for (const auto& diff : diffs) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum + edit_length, right_sum); - ASSERT_LE(diff.first.size(), 3); - ASSERT_LE(diff.second.size(), 3); + if (!::arrow::is_list_like(dtype->id())) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : 
diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum + edit_length, right_sum); + } + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } } @@ -387,242 +447,244 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, if (::arrow::is_fixed_width(dtype->id())) { auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); - AssertAllBetween(base_result.first, min_length, max_length, + AssertAllBetween(base_result.lengths, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.first, min_length, max_length, + AssertAllBetween(modified_result.lengths, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); } else if (::arrow::is_base_binary_like(dtype->id()) && !nullable && !enable_dictionary) { - AssertAllBetween(base_result.second, min_chunk_size, max_chunk_size); - AssertAllBetween(modified_result.second, min_chunk_size, max_chunk_size); + AssertAllBetween(base_result.sizes, min_chunk_size, max_chunk_size); + AssertAllBetween(modified_result.sizes, min_chunk_size, max_chunk_size); } } -constexpr uint64_t kMinChunkSize = 32 * 1024; -constexpr uint64_t kMaxChunkSize = 128 * 1024; -constexpr uint64_t kPartLength = 128 * 1024; -constexpr uint64_t kEditLength = 32; - -// TODO: -// - test nested types -// - test multiple row groups +constexpr uint64_t kMinChunkSize = 16 * 1024; +constexpr uint64_t kMaxChunkSize = 64 * 1024; +constexpr uint64_t kPartSize = 256 * 1024; +constexpr uint64_t kEditSize = 256; class TestColumnCDC : public ::testing::TestWithParam< - std::tuple, bool, bool>> {}; + std::tuple, bool, size_t>> { + protected: + // Column random table parts for testing + std::shared_ptr field_; + std::shared_ptr
part1_, part2_, part3_, part4_, part5_, part6_, part7_; + + void SetUp() override { + auto [dtype, nullable, byte_per_record] = GetParam(); + auto field_ = ::arrow::field("f0", dtype, nullable); + + auto part_length = kPartSize / byte_per_record; + auto edit_length = kEditSize / byte_per_record; + // Generate random table parts, these are later concatenated to simulate + // different scenarios like insert, update, delete, and append. + part1_ = GenerateTable({field_}, part_length, /*seed=*/1); + part2_ = GenerateTable({field_}, edit_length, /*seed=*/2); + part3_ = GenerateTable({field_}, part_length, /*seed=*/3); + part4_ = GenerateTable({field_}, edit_length, /*seed=*/4); + part5_ = GenerateTable({field_}, part_length, /*seed=*/5); + part6_ = GenerateTable({field_}, edit_length, /*seed=*/6); + part7_ = GenerateTable({field_}, edit_length, /*seed=*/7); + } +}; TEST_P(TestColumnCDC, DeleteOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part3}); + auto base = ConcatAndCombine({part1_, part2_, part3_}); + auto modified = ConcatAndCombine({part1_, part3_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + 
WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); - AssertDeleteCase(base_result.first, modified_result.first, 1, kEditLength); + AssertDeleteCase(dtype, base_result.lengths, modified_result.lengths, 1, + part2_->num_rows()); + } } TEST_P(TestColumnCDC, DeleteTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); + auto [dtype, nullable, _] = GetParam(); - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - - auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part3, part5}); + auto base = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); + auto modified = ConcatAndCombine({part1_, part3_, part5_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - 
AssertDeleteCase(base_result.first, modified_result.first, 2, kEditLength); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertDeleteCase(dtype, base_result.lengths, modified_result.lengths, 2, + part2_->num_rows()); + } } TEST_P(TestColumnCDC, UpdateOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); + auto [dtype, nullable, _] = GetParam(); - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part4, part3}); + auto base = ConcatAndCombine({part1_, part2_, part3_}); + auto modified = ConcatAndCombine({part1_, part4_, part3_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertUpdateCase(base_result.first, modified_result.first, 1); + for (bool enable_dictionary : {false, true}) { + 
ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertUpdateCase(dtype, base_result.lengths, modified_result.lengths, 1); + } } TEST_P(TestColumnCDC, UpdateTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); - - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - auto part6 = GenerateTable({field}, kEditLength, /*seed=*/6); - auto part7 = GenerateTable({field}, kEditLength, /*seed=*/7); - - auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); - ASSERT_FALSE(base->Equals(*modified)); + auto [dtype, nullable, _] = GetParam(); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + auto base = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); + auto modified = ConcatAndCombine({part1_, part6_, part3_, part7_, part5_}); + ASSERT_FALSE(base->Equals(*modified)); - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, 
kMaxChunkSize); - AssertUpdateCase(base_result.first, modified_result.first, 2); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertUpdateCase(dtype, base_result.lengths, modified_result.lengths, 2); + } } TEST_P(TestColumnCDC, InsertOnce) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); + auto [dtype, nullable, _] = GetParam(); - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - - auto base = ConcatAndCombine({part1, part3}); - auto modified = ConcatAndCombine({part1, part2, part3}); + auto base = ConcatAndCombine({part1_, part3_}); + auto modified = ConcatAndCombine({part1_, part2_, part3_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertInsertCase(base_result.first, modified_result.first, 1, kEditLength); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + 
/*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertInsertCase(dtype, base_result.lengths, modified_result.lengths, 1, + part2_->num_rows()); + } } TEST_P(TestColumnCDC, InsertTwice) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); + auto [dtype, nullable, _] = GetParam(); - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - - auto base = ConcatAndCombine({part1, part3, part5}); - auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); + auto base = ConcatAndCombine({part1_, part3_, part5_}); + auto modified = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertInsertCase(base_result.first, modified_result.first, 2, kEditLength); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + 
/*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertInsertCase(dtype, base_result.lengths, modified_result.lengths, 2, + part2_->num_rows()); + } } TEST_P(TestColumnCDC, Append) { - auto dtype = std::get<0>(GetParam()); - auto nullable = std::get<1>(GetParam()); - auto enable_dictionary = std::get<2>(GetParam()); + auto [dtype, nullable, _] = GetParam(); - auto field = ::arrow::field("f0", dtype, nullable); - - auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); - auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); - auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); - - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part2, part3, part4}); + auto base = ConcatAndCombine({part1_, part2_, part3_}); + auto modified = ConcatAndCombine({part1_, part2_, part3_, part4_}); ASSERT_FALSE(base->Equals(*modified)); - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertAppendCase(base_result.first, modified_result.first); + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_result, + WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_result, + WriteAndGetPageSizes(modified, kMinChunkSize, 
kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, + kMinChunkSize, kMaxChunkSize); + AssertAppendCase(base_result.lengths, modified_result.lengths); + } } // TODO(kszucs): add extension type and dictionary type INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, - Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), - ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), - ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), - ::arrow::float64(), ::arrow::binary(), ::arrow::large_binary(), - ::arrow::fixed_size_binary(16), ::arrow::utf8(), ::arrow::large_utf8(), - ::arrow::date32(), ::arrow::date64(), ::arrow::decimal128(18, 6), - ::arrow::decimal256(40, 6), ::arrow::time32(::arrow::TimeUnit::SECOND), - ::arrow::time64(::arrow::TimeUnit::NANO), - ::arrow::timestamp(::arrow::TimeUnit::NANO), - ::arrow::duration(::arrow::TimeUnit::NANO)), - Bool(), Bool())); + testing::Values( + // Numeric + std::make_tuple(::arrow::uint8(), false, 1), + std::make_tuple(::arrow::uint16(), true, 2), + std::make_tuple(::arrow::uint32(), false, 4), + std::make_tuple(::arrow::uint64(), true, 8), + std::make_tuple(::arrow::int8(), false, 1), + std::make_tuple(::arrow::int16(), false, 2), + std::make_tuple(::arrow::int32(), false, 4), + std::make_tuple(::arrow::int64(), true, 8), + std::make_tuple(::arrow::float16(), false, 2), + std::make_tuple(::arrow::float32(), false, 4), + std::make_tuple(::arrow::float64(), true, 8), + std::make_tuple(::arrow::decimal128(18, 6), false, 16), + std::make_tuple(::arrow::decimal256(40, 6), false, 32), + // Binary-like + std::make_tuple(::arrow::binary(), true, 16), + + std::make_tuple(::arrow::large_binary(), false, 16), + std::make_tuple(::arrow::fixed_size_binary(16), true, 16), + std::make_tuple(::arrow::utf8(), false, 16), + std::make_tuple(::arrow::utf8(), true, 16), + std::make_tuple(::arrow::large_utf8(), 
false, 16), + // Temporal + std::make_tuple(::arrow::date32(), false, 4), + std::make_tuple(::arrow::date64(), false, 8), + std::make_tuple(::arrow::time32(::arrow::TimeUnit::SECOND), true, 4), + std::make_tuple(::arrow::time64(::arrow::TimeUnit::NANO), false, 8), + std::make_tuple(::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8), + std::make_tuple(::arrow::duration(::arrow::TimeUnit::NANO), false, 8), + // Nested types + std::make_tuple(::arrow::list(::arrow::int32()), false, 64), + std::make_tuple(::arrow::list(::arrow::int32()), true, 64), + std::make_tuple(::arrow::list(::arrow::utf8()), true, 64), + std::make_tuple(::arrow::large_list(::arrow::int32()), true, 64), + std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, + 8), + std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), + true, 10))); } // namespace parquet + +// TODO: +// - test multiple row groups +// - test empty diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index c6e2716d4e4..26a30a8bcb3 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,8 +754,8 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_size_range().first, - properties->cdc_size_range().second) { + content_defined_chunker_(level_info_, properties->cdc_size_range(), + properties->cdc_norm_factor()) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index ccb8e975d12..a980ab29d80 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -170,6 +170,7 @@ static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = SizeStatisticsLevel::PageAndColumnChunk; static constexpr std::pair DEFAULT_CDC_SIZE_RANGE = std::make_pair(256 * 
1024, 1024 * 1024); +static constexpr uint8_t DEFAULT_CDC_NORM_FACTOR = 0; class PARQUET_EXPORT ColumnProperties { public: @@ -265,7 +266,8 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), cdc_enabled_(false), - cdc_size_range_(DEFAULT_CDC_SIZE_RANGE) {} + cdc_size_range_(DEFAULT_CDC_SIZE_RANGE), + cdc_norm_factor_(DEFAULT_CDC_NORM_FACTOR) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -282,7 +284,8 @@ class PARQUET_EXPORT WriterProperties { sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()), cdc_enabled_(properties.cdc_enabled()), - cdc_size_range_(properties.cdc_size_range()) {} + cdc_size_range_(properties.cdc_size_range()), + cdc_norm_factor_(properties.cdc_norm_factor()) {} virtual ~Builder() {} @@ -301,6 +304,11 @@ class PARQUET_EXPORT WriterProperties { return this; } + Builder* cdc_norm_factor(uint8_t norm_factor) { + cdc_norm_factor_ = norm_factor; + return this; + } + /// Specify the memory pool for the writer. Default default_memory_pool. 
Builder* memory_pool(MemoryPool* pool) { pool_ = pool; @@ -724,7 +732,7 @@ class PARQUET_EXPORT WriterProperties { size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, - cdc_size_range_)); + cdc_size_range_, cdc_norm_factor_)); } private: @@ -756,6 +764,7 @@ class PARQUET_EXPORT WriterProperties { bool cdc_enabled_; std::pair cdc_size_range_; + uint8_t cdc_norm_factor_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -782,6 +791,7 @@ class PARQUET_EXPORT WriterProperties { inline bool cdc_enabled() const { return cdc_enabled_; } inline std::pair cdc_size_range() const { return cdc_size_range_; } + inline uint8_t cdc_norm_factor() const { return cdc_norm_factor_; } inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; @@ -886,7 +896,7 @@ class PARQUET_EXPORT WriterProperties { const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, std::vector sorting_columns, bool cdc_enabled, - std::pair cdc_size_range) + std::pair cdc_size_range, uint8_t cdc_norm_factor) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -903,9 +913,8 @@ class PARQUET_EXPORT WriterProperties { default_column_properties_(default_column_properties), column_properties_(column_properties), cdc_enabled_(cdc_enabled), - cdc_size_range_(cdc_size_range) - - {} + cdc_size_range_(cdc_size_range), + cdc_norm_factor_(cdc_norm_factor) {} MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -928,6 +937,7 @@ class PARQUET_EXPORT WriterProperties { bool cdc_enabled_; std::pair cdc_size_range_; + uint8_t cdc_norm_factor_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 
a0dcfd6d453..02bb070aadb 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -498,6 +498,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* enable_cdc() Builder* disable_cdc() Builder* cdc_size_range(uint64_t min_size, uint64_t max_size) + Builder* cdc_norm_factor(uint8_t norm_factor) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -649,7 +650,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=*, sorting_columns=*, store_decimal_as_integer=*, - content_defined_chunking=* + cdc=*, + cdc_size_range=*, + cdc_norm_factor=*, ) except * diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 1ef86d3e3f9..28228780ea9 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1973,7 +1973,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=False, sorting_columns=None, store_decimal_as_integer=False, - content_defined_chunking=False) except *: + cdc=False, + cdc_size_range=None, + cdc_norm_factor=None) except *: """General writer properties""" cdef: @@ -2114,18 +2116,16 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.dictionary_pagesize_limit(dictionary_pagesize_limit) # content defined chunking - if content_defined_chunking is False: + if cdc is False: props.disable_cdc() - elif content_defined_chunking is True: - props.enable_cdc() - elif isinstance(content_defined_chunking, tuple): - min_size, max_size = content_defined_chunking + elif cdc is True: props.enable_cdc() + + if cdc_size_range is not None: + min_size, max_size = cdc_size_range props.cdc_size_range(min_size, max_size) - else: - raise ValueError( - "Unsupported value for content_defined_chunking: {0}" - .format(content_defined_chunking)) + if cdc_norm_factor is not None: + props.cdc_norm_factor(cdc_norm_factor) # encryption @@ -2277,7 +2277,9 @@ cdef class ParquetWriter(_Weakrefable): 
write_page_checksum=False, sorting_columns=None, store_decimal_as_integer=False, - content_defined_chunking=False): + cdc=False, + cdc_size_range=None, + cdc_norm_factor=None): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -2312,7 +2314,7 @@ cdef class ParquetWriter(_Weakrefable): write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, store_decimal_as_integer=store_decimal_as_integer, - content_defined_chunking=content_defined_chunking + cdc=cdc, cdc_size_range=cdc_size_range, cdc_norm_factor=cdc_norm_factor ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, From a20ebbf1e16f77411424b8a371a01927d3ab6f77 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 24 Feb 2025 20:18:22 +0100 Subject: [PATCH 019/102] reduce the testing data size to make the test cases quicker --- cpp/src/parquet/column_chunker_test.cc | 40 ++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index 7bfdb4ed138..d4b323f1e06 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -87,7 +87,9 @@ Result> WriteTableToBuffer(const std::shared_ptr
& auto sink = CreateOutputStream(); auto builder = WriterProperties::Builder(); - builder.enable_cdc()->cdc_size_range(min_chunk_size, max_chunk_size); + builder.enable_cdc() + ->cdc_size_range(min_chunk_size, max_chunk_size) + ->cdc_norm_factor(0); if (enable_dictionary) { builder.enable_dictionary(); } else { @@ -257,6 +259,36 @@ std::vector, std::vector>> FindDiffere return merged; } +void PrintDifferences( + const std::vector& original, const std::vector& modified, + std::vector, std::vector>>& diffs) { + std::cout << "Original: "; + for (const auto& val : original) { + std::cout << val << " "; + } + std::cout << std::endl; + + std::cout << "Modified: "; + for (const auto& val : modified) { + std::cout << val << " "; + } + std::cout << std::endl; + + for (const auto& diff : diffs) { + std::cout << "First: "; + for (const auto& val : diff.first) { + std::cout << val << " "; + } + std::cout << std::endl; + + std::cout << "Second: "; + for (const auto& val : diff.second) { + std::cout << val << " "; + } + std::cout << std::endl; + } +} + TEST(TestFindDifferences, Basic) { std::vector first = {1, 2, 3, 4, 5}; std::vector second = {1, 7, 8, 4, 5}; @@ -362,8 +394,6 @@ void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, const std::vector& original, const std::vector& modified, uint8_t n_modifications) { auto diffs = FindDifferences(original, modified); - // Print diffs, original, and modified sequences for debugging purposes - ASSERT_LE(diffs.size(), n_modifications); for (const auto& diff : diffs) { @@ -458,9 +488,9 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, } } -constexpr uint64_t kMinChunkSize = 16 * 1024; +constexpr uint64_t kMinChunkSize = 8 * 1024; constexpr uint64_t kMaxChunkSize = 64 * 1024; -constexpr uint64_t kPartSize = 256 * 1024; +constexpr uint64_t kPartSize = 64 * 1024; constexpr uint64_t kEditSize = 256; class TestColumnCDC : public ::testing::TestWithParam< From 47aa8b0dd873ba7541fbb1cb10d928240c43879b 
Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 24 Feb 2025 21:28:45 +0100 Subject: [PATCH 020/102] increase testing data size --- cpp/src/parquet/column_chunker_test.cc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index d4b323f1e06..79ae8f1cca9 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -394,6 +394,9 @@ void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, const std::vector& original, const std::vector& modified, uint8_t n_modifications) { auto diffs = FindDifferences(original, modified); + if (diffs.size() > n_modifications) { + PrintDifferences(original, modified, diffs); + } ASSERT_LE(diffs.size(), n_modifications); for (const auto& diff : diffs) { @@ -418,6 +421,9 @@ void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, const std::vector& modified, uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); + if (diffs.size() != n_modifications) { + PrintDifferences(original, modified, diffs); + } ASSERT_EQ(diffs.size(), n_modifications); for (const auto& diff : diffs) { @@ -437,6 +443,9 @@ void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, const std::vector& modified, uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); + if (diffs.size() != n_modifications) { + PrintDifferences(original, modified, diffs); + } ASSERT_EQ(diffs.size(), n_modifications); for (const auto& diff : diffs) { @@ -474,6 +483,7 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, PageSizes base_result, PageSizes modified_result, bool nullable, bool enable_dictionary, uint64_t min_chunk_size, uint64_t max_chunk_size) { + max_chunk_size *= 1.2; if (::arrow::is_fixed_width(dtype->id())) { auto min_length = ElementCount(min_chunk_size, 
dtype->byte_width(), nullable); auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); @@ -488,10 +498,10 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, } } -constexpr uint64_t kMinChunkSize = 8 * 1024; -constexpr uint64_t kMaxChunkSize = 64 * 1024; -constexpr uint64_t kPartSize = 64 * 1024; -constexpr uint64_t kEditSize = 256; +constexpr uint64_t kMinChunkSize = 64 * 1024; +constexpr uint64_t kMaxChunkSize = 128 * 1024; +constexpr uint64_t kPartSize = 256 * 1024; +constexpr uint64_t kEditSize = 128; class TestColumnCDC : public ::testing::TestWithParam< std::tuple, bool, size_t>> { From 86e348fa1f27df1d2a4274a727f84abe58264a85 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 24 Feb 2025 21:46:18 +0100 Subject: [PATCH 021/102] add a custom array generator to always produce the same array --- cpp/src/parquet/column_chunker_test.cc | 281 +++++++++++++++++++------ 1 file changed, 221 insertions(+), 60 deletions(-) diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index 79ae8f1cca9..f7b1e8fe1c7 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -17,43 +17,33 @@ #include #include +#include +#include +#include #include #include -#include "arrow/array.h" -#include "arrow/array/builder_binary.h" -#include "arrow/array/builder_decimal.h" -#include "arrow/array/builder_primitive.h" #include "arrow/table.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/random.h" #include "arrow/type_fwd.h" -#include "arrow/type_traits.h" -#include "arrow/util/decimal.h" #include "arrow/util/float16.h" -#include "arrow/util/logging.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/reader_internal.h" -#include "parquet/arrow/schema.h" #include "parquet/arrow/test_util.h" #include "parquet/arrow/writer.h" #include "parquet/column_writer.h" #include "parquet/file_writer.h" -#include "parquet/page_index.h" -#include
"parquet/test_util.h" namespace parquet { using ::arrow::Array; using ::arrow::ChunkedArray; using ::arrow::ConcatenateTables; +using ::arrow::DataType; using ::arrow::default_memory_pool; using ::arrow::Field; using ::arrow::Result; using ::arrow::Table; using ::arrow::io::BufferReader; -using ::arrow::random::GenerateArray; -using ::arrow::random::GenerateBatch; using ::parquet::arrow::FileReader; using ::parquet::arrow::FileReaderBuilder; using ::parquet::arrow::MakeSimpleTable; @@ -64,18 +54,191 @@ using ::testing::Bool; using ::testing::Combine; using ::testing::Values; -std::shared_ptr
GenerateTable(const std::vector>& fields, - int64_t size, int32_t seed = 42) { - auto batch = GenerateBatch(fields, size, seed); - return Table::FromRecordBatches({batch}).ValueOrDie(); +// generate deterministic and platform-independent data +inline uint64_t hash(uint64_t seed, uint64_t index) { + uint64_t h = (index + seed) * 0xc4ceb9fe1a85ec53ull; + h ^= h >> 33; + h *= 0xff51afd7ed558ccdull; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53ull; + h ^= h >> 33; + return h; } -std::shared_ptr
ConcatAndCombine( +#define GENERATE_CASE_BODY(BUILDER_TYPE, VALUE_EXPR) \ + { \ + BUILDER_TYPE builder(type, default_memory_pool()); \ + if (nullable) { \ + for (int64_t i = 0; i < length; ++i) { \ + uint64_t val = hash(seed, i); \ + if (val % 10 == 0) { \ + RETURN_NOT_OK(builder.AppendNull()); \ + } else { \ + RETURN_NOT_OK(builder.Append(VALUE_EXPR)); \ + } \ + } \ + } else { \ + for (int64_t i = 0; i < length; ++i) { \ + uint64_t val = hash(seed, i); \ + RETURN_NOT_OK(builder.Append(VALUE_EXPR)); \ + } \ + } \ + std::shared_ptr array; \ + RETURN_NOT_OK(builder.Finish(&array)); \ + RETURN_NOT_OK(array->ValidateFull()); \ + return array; \ + } + +// Macro to generate a case for a given scalar type. +#define GENERATE_CASE(TYPE_ID, BUILDER_TYPE, VALUE_EXPR) \ + case ::arrow::Type::TYPE_ID: { \ + GENERATE_CASE_BODY(BUILDER_TYPE, VALUE_EXPR) \ + } + +Result> GenerateArray(const std::shared_ptr& field, + int64_t length, uint64_t seed) { + const std::shared_ptr& type = field->type(); + bool nullable = field->nullable(); + + switch (type->id()) { + GENERATE_CASE(BOOL, ::arrow::BooleanBuilder, (val % 2 == 0)) + + // Numeric types. 
+ GENERATE_CASE(INT8, ::arrow::Int8Builder, static_cast(val)) + GENERATE_CASE(INT16, ::arrow::Int16Builder, static_cast(val)) + GENERATE_CASE(INT32, ::arrow::Int32Builder, static_cast(val)) + GENERATE_CASE(INT64, ::arrow::Int64Builder, static_cast(val)) + GENERATE_CASE(UINT8, ::arrow::UInt8Builder, static_cast(val)) + GENERATE_CASE(UINT16, ::arrow::UInt16Builder, static_cast(val)) + GENERATE_CASE(UINT32, ::arrow::UInt32Builder, static_cast(val)) + GENERATE_CASE(UINT64, ::arrow::UInt64Builder, static_cast(val)) + GENERATE_CASE(HALF_FLOAT, ::arrow::HalfFloatBuilder, + static_cast(val % 1000)) + GENERATE_CASE(FLOAT, ::arrow::FloatBuilder, static_cast(val % 1000) / 1000.0f) + GENERATE_CASE(DOUBLE, ::arrow::DoubleBuilder, + static_cast(val % 100000) / 1000.0) + case ::arrow::Type::DECIMAL128: { + const auto& decimal_type = static_cast(*type); + // Limit the value to fit within the specified precision + int32_t max_exponent = decimal_type.precision() - decimal_type.scale(); + int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); + GENERATE_CASE_BODY(::arrow::Decimal128Builder, ::arrow::Decimal128(val % max_value)) + } + case ::arrow::Type::DECIMAL256: { + const auto& decimal_type = static_cast(*type); + // Limit the value to fit within the specified precision, capped at 9 to avoid + // int64_t overflow + int32_t max_exponent = std::min(9, decimal_type.precision() - decimal_type.scale()); + int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); + GENERATE_CASE_BODY(::arrow::Decimal256Builder, ::arrow::Decimal256(val % max_value)) + } + + // Temporal types + GENERATE_CASE(DATE32, ::arrow::Date32Builder, static_cast(val)) + GENERATE_CASE(TIME32, ::arrow::Time32Builder, + std::abs(static_cast(val) % 86400000)) + GENERATE_CASE(TIME64, ::arrow::Time64Builder, + std::abs(static_cast(val) % 86400000000)) + GENERATE_CASE(TIMESTAMP, ::arrow::TimestampBuilder, static_cast(val)) + GENERATE_CASE(DURATION, ::arrow::DurationBuilder, static_cast(val)) + + // Binary 
and string types. + GENERATE_CASE(STRING, ::arrow::StringBuilder, + std::string("str_") + std::to_string(val)) + GENERATE_CASE(LARGE_STRING, ::arrow::LargeStringBuilder, + std::string("str_") + std::to_string(val)) + GENERATE_CASE(BINARY, ::arrow::BinaryBuilder, + std::string("bin_") + std::to_string(val)) + case ::arrow::Type::FIXED_SIZE_BINARY: { + auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); + GENERATE_CASE_BODY(::arrow::FixedSizeBinaryBuilder, + std::string("bin_") + std::to_string(val).substr(0, size - 4)) + } + + case ::arrow::Type::STRUCT: { + auto struct_type = static_cast<::arrow::StructType*>(type.get()); + std::vector> child_arrays; + for (auto i = 0; i < struct_type->num_fields(); i++) { + ARROW_ASSIGN_OR_RAISE(auto child_array, + GenerateArray(struct_type->field(i), length, + seed + static_cast(i + 300))); + child_arrays.push_back(child_array); + } + auto struct_array = + std::make_shared<::arrow::StructArray>(type, length, child_arrays); + return struct_array; + } + + case ::arrow::Type::LIST: { + auto list_type = static_cast<::arrow::ListType*>(type.get()); + auto value_field = ::arrow::field("item", list_type->value_type()); + ARROW_ASSIGN_OR_RAISE(auto values_array, GenerateArray(value_field, length, seed)); + auto offset_builder = ::arrow::Int32Builder(); + auto bitmap_builder = ::arrow::TypedBufferBuilder(); + + int32_t num_nulls = 0; + int32_t num_elements = 0; + uint8_t element_size = 0; + int32_t current_offset = 0; + RETURN_NOT_OK(offset_builder.Append(current_offset)); + while (current_offset < length) { + num_elements++; + auto is_valid = !(nullable && (num_elements % 10 == 0)); + if (is_valid) { + RETURN_NOT_OK(bitmap_builder.Append(true)); + current_offset += element_size; + if (current_offset > length) { + RETURN_NOT_OK(offset_builder.Append(static_cast(length))); + break; + } else { + RETURN_NOT_OK(offset_builder.Append(current_offset)); + } + } else { + 
RETURN_NOT_OK(offset_builder.Append(static_cast(current_offset))); + RETURN_NOT_OK(bitmap_builder.Append(false)); + num_nulls++; + } + + if (element_size > 4) { + element_size = 0; + } else { + element_size++; + } + } + + std::shared_ptr offsets_array; + RETURN_NOT_OK(offset_builder.Finish(&offsets_array)); + std::shared_ptr bitmap_buffer; + RETURN_NOT_OK(bitmap_builder.Finish(&bitmap_buffer)); + ARROW_ASSIGN_OR_RAISE( + auto list_array, ::arrow::ListArray::FromArrays( + type, *offsets_array, *values_array, default_memory_pool(), + bitmap_buffer, num_nulls)); + RETURN_NOT_OK(list_array->ValidateFull()); + return list_array; + } + + default: + return ::arrow::Status::NotImplemented("Unsupported data type " + type->ToString()); + } +} + +Result> GenerateTable( + const std::shared_ptr<::arrow::Schema>& schema, int64_t size, uint64_t seed = 0) { + std::vector> arrays; + for (const auto& field : schema->fields()) { + ARROW_ASSIGN_OR_RAISE(auto array, GenerateArray(field, size, seed)); + arrays.push_back(array); + } + return Table::Make(schema, arrays, size); +} + +Result> ConcatAndCombine( const std::vector>& parts) { // Concatenate and combine chunks so the table doesn't carry information about // the modification points - auto table = ConcatenateTables(parts).ValueOrDie(); - return table->CombineChunks().ValueOrDie(); + ARROW_ASSIGN_OR_RAISE(auto table, ConcatenateTables(parts)); + return table->CombineChunks(); } Result> WriteTableToBuffer(const std::shared_ptr
& table, @@ -483,7 +646,7 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, PageSizes base_result, PageSizes modified_result, bool nullable, bool enable_dictionary, uint64_t min_chunk_size, uint64_t max_chunk_size) { - max_chunk_size *= 1.2; + max_chunk_size = static_cast(max_chunk_size * 1.2); if (::arrow::is_fixed_width(dtype->id())) { auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); @@ -498,9 +661,9 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, } } -constexpr uint64_t kMinChunkSize = 64 * 1024; -constexpr uint64_t kMaxChunkSize = 128 * 1024; -constexpr uint64_t kPartSize = 256 * 1024; +constexpr uint64_t kMinChunkSize = 8 * 1024; +constexpr uint64_t kMaxChunkSize = 32 * 1024; +constexpr uint64_t kPartSize = 128 * 1024; constexpr uint64_t kEditSize = 128; class TestColumnCDC : public ::testing::TestWithParam< @@ -513,26 +676,25 @@ class TestColumnCDC : public ::testing::TestWithParam< void SetUp() override { auto [dtype, nullable, byte_per_record] = GetParam(); auto field_ = ::arrow::field("f0", dtype, nullable); + auto schema = ::arrow::schema({field_}); auto part_length = kPartSize / byte_per_record; auto edit_length = kEditSize / byte_per_record; - // Generate random table parts, these are later concatenated to simulate - // different scenarios like insert, update, delete, and append. 
- part1_ = GenerateTable({field_}, part_length, /*seed=*/1); - part2_ = GenerateTable({field_}, edit_length, /*seed=*/2); - part3_ = GenerateTable({field_}, part_length, /*seed=*/3); - part4_ = GenerateTable({field_}, edit_length, /*seed=*/4); - part5_ = GenerateTable({field_}, part_length, /*seed=*/5); - part6_ = GenerateTable({field_}, edit_length, /*seed=*/6); - part7_ = GenerateTable({field_}, edit_length, /*seed=*/7); + ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, part_length, 0)); + ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, edit_length, 1)); + ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, part_length, part_length)); + ASSERT_OK_AND_ASSIGN(part4_, GenerateTable(schema, edit_length, 2)); + ASSERT_OK_AND_ASSIGN(part5_, GenerateTable(schema, part_length, 2 * part_length)); + ASSERT_OK_AND_ASSIGN(part6_, GenerateTable(schema, edit_length, 3)); + ASSERT_OK_AND_ASSIGN(part7_, GenerateTable(schema, edit_length, 4)); } }; TEST_P(TestColumnCDC, DeleteOnce) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part2_, part3_}); - auto modified = ConcatAndCombine({part1_, part3_}); + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -554,8 +716,9 @@ TEST_P(TestColumnCDC, DeleteOnce) { TEST_P(TestColumnCDC, DeleteTwice) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); - auto modified = ConcatAndCombine({part1_, part3_, part5_}); + ASSERT_OK_AND_ASSIGN(auto base, + ConcatAndCombine({part1_, part2_, part3_, part4_, part5_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part3_, part5_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -576,8 +739,8 @@ TEST_P(TestColumnCDC, DeleteTwice) { TEST_P(TestColumnCDC, 
UpdateOnce) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part2_, part3_}); - auto modified = ConcatAndCombine({part1_, part4_, part3_}); + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part4_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -597,8 +760,10 @@ TEST_P(TestColumnCDC, UpdateOnce) { TEST_P(TestColumnCDC, UpdateTwice) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); - auto modified = ConcatAndCombine({part1_, part6_, part3_, part7_, part5_}); + ASSERT_OK_AND_ASSIGN(auto base, + ConcatAndCombine({part1_, part2_, part3_, part4_, part5_})); + ASSERT_OK_AND_ASSIGN(auto modified, + ConcatAndCombine({part1_, part6_, part3_, part7_, part5_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -618,8 +783,8 @@ TEST_P(TestColumnCDC, UpdateTwice) { TEST_P(TestColumnCDC, InsertOnce) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part3_}); - auto modified = ConcatAndCombine({part1_, part2_, part3_}); + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part2_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -640,8 +805,9 @@ TEST_P(TestColumnCDC, InsertOnce) { TEST_P(TestColumnCDC, InsertTwice) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part3_, part5_}); - auto modified = ConcatAndCombine({part1_, part2_, part3_, part4_, part5_}); + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_, part5_})); + ASSERT_OK_AND_ASSIGN(auto modified, + ConcatAndCombine({part1_, part2_, part3_, part4_, part5_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : 
{false, true}) { @@ -662,8 +828,8 @@ TEST_P(TestColumnCDC, InsertTwice) { TEST_P(TestColumnCDC, Append) { auto [dtype, nullable, _] = GetParam(); - auto base = ConcatAndCombine({part1_, part2_, part3_}); - auto modified = ConcatAndCombine({part1_, part2_, part3_, part4_}); + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part2_, part3_, part4_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { @@ -686,7 +852,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Values( // Numeric std::make_tuple(::arrow::uint8(), false, 1), - std::make_tuple(::arrow::uint16(), true, 2), + std::make_tuple(::arrow::uint16(), false, 2), std::make_tuple(::arrow::uint32(), false, 4), std::make_tuple(::arrow::uint64(), true, 8), std::make_tuple(::arrow::int8(), false, 1), @@ -699,25 +865,20 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(::arrow::decimal128(18, 6), false, 16), std::make_tuple(::arrow::decimal256(40, 6), false, 32), // Binary-like + std::make_tuple(::arrow::utf8(), false, 16), std::make_tuple(::arrow::binary(), true, 16), - - std::make_tuple(::arrow::large_binary(), false, 16), std::make_tuple(::arrow::fixed_size_binary(16), true, 16), - std::make_tuple(::arrow::utf8(), false, 16), - std::make_tuple(::arrow::utf8(), true, 16), - std::make_tuple(::arrow::large_utf8(), false, 16), + // Temporal std::make_tuple(::arrow::date32(), false, 4), - std::make_tuple(::arrow::date64(), false, 8), - std::make_tuple(::arrow::time32(::arrow::TimeUnit::SECOND), true, 4), + std::make_tuple(::arrow::time32(::arrow::TimeUnit::MILLI), true, 4), std::make_tuple(::arrow::time64(::arrow::TimeUnit::NANO), false, 8), std::make_tuple(::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8), std::make_tuple(::arrow::duration(::arrow::TimeUnit::NANO), false, 8), // Nested types - std::make_tuple(::arrow::list(::arrow::int32()), false, 64), - 
std::make_tuple(::arrow::list(::arrow::int32()), true, 64), - std::make_tuple(::arrow::list(::arrow::utf8()), true, 64), - std::make_tuple(::arrow::large_list(::arrow::int32()), true, 64), + std::make_tuple(::arrow::list(::arrow::int32()), false, 16), + std::make_tuple(::arrow::list(::arrow::int32()), true, 18), + std::make_tuple(::arrow::list(::arrow::utf8()), true, 18), std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, 8), std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), From 960883a4710ecb81bcf47bc4f1aa2605ef2b52f7 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 27 Feb 2025 18:05:23 +0100 Subject: [PATCH 022/102] address review comments --- cpp/src/parquet/column_chunker.cc | 41 ++++++++---------- cpp/src/parquet/column_chunker.h | 2 +- cpp/src/parquet/column_writer.cc | 2 +- python/pyarrow/_parquet.pyx | 6 +-- python/run_test.sh | 72 +++++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 28 deletions(-) create mode 100755 python/run_test.sh diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index 14de6d81575..68c522a17c5 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -592,9 +592,7 @@ void ContentDefinedChunker::Roll(const T value) { auto bytes = reinterpret_cast(&value); for (size_t i = 0; i < BYTE_WIDTH; ++i) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; - if ((rolling_hash_ & hash_mask_) == 0) { - has_matched_ = true; - } + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } @@ -608,15 +606,13 @@ void ContentDefinedChunker::Roll(std::string_view value) { for (char c : value) { rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; - if ((rolling_hash_ & hash_mask_) == 0) { - has_matched_ = true; - } + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } -bool ContentDefinedChunker::Check() { +bool 
ContentDefinedChunker::NeedNewChunk() { // decide whether to create a new chunk based on the rolling hash; has_matched_ is - // set to true if we encountered a match since the last Check() call + // set to true if we encountered a match since the last NeedNewChunk() call if (ARROW_PREDICT_FALSE(has_matched_)) { has_matched_ = false; // in order to have a normal distribution of chunk sizes, we only create a new chunk @@ -631,7 +627,8 @@ bool ContentDefinedChunker::Check() { } if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { // we have a hard limit on the maximum chunk size, not that we don't reset the rolling - // hash state here, so the next Check() call will continue from the current state + // hash state here, so the next NeedNewChunk() call will continue from the current + // state chunk_size_ = 0; return true; } @@ -643,7 +640,7 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev const int16_t* rep_levels, int64_t num_levels, const T& leaf_array) { - std::vector result; + std::vector chunks; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; @@ -654,13 +651,13 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev while (offset < num_levels) { Roll(leaf_array.GetView(offset)); ++offset; - if (Check()) { - result.emplace_back(prev_offset, prev_offset, offset - prev_offset); + if (NeedNewChunk()) { + chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; } } if (prev_offset < num_levels) { - result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + chunks.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); } } else if (!has_rep_levels) { // non-nested data with nulls @@ -670,13 +667,13 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev Roll(def_levels[offset]); Roll(leaf_array.GetView(offset)); ++offset; - if (Check()) { - result.emplace_back(prev_offset, prev_offset, offset 
- prev_offset); + if (NeedNewChunk()) { + chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; } } if (prev_offset < num_levels) { - result.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); + chunks.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); } } else { // nested data with nulls @@ -684,12 +681,11 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev bool is_record_boundary; int16_t def_level; int16_t rep_level; - int64_t level_offset = 0; int64_t value_offset = 0; int64_t record_level_offset = 0; int64_t record_value_offset = 0; - while (level_offset < num_levels) { + for (int64_t level_offset = 0; level_offset < num_levels; ++level_offset) { def_level = def_levels[level_offset]; rep_level = rep_levels[level_offset]; @@ -702,16 +698,15 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev Roll(leaf_array.GetView(value_offset)); } - if (is_record_boundary && Check()) { + if (is_record_boundary && NeedNewChunk()) { auto levels_to_write = level_offset - record_level_offset; if (levels_to_write > 0) { - result.emplace_back(record_level_offset, record_value_offset, levels_to_write); + chunks.emplace_back(record_level_offset, record_value_offset, levels_to_write); record_level_offset = level_offset; record_value_offset = value_offset; } } - ++level_offset; if (has_leaf_value) { ++value_offset; } @@ -719,11 +714,11 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev auto levels_to_write = num_levels - record_level_offset; if (levels_to_write > 0) { - result.emplace_back(record_level_offset, record_value_offset, levels_to_write); + chunks.emplace_back(record_level_offset, record_value_offset, levels_to_write); } } - return result; + return chunks; } #define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 5011620bd31..f77fdc435fb 100644 --- 
a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -54,7 +54,7 @@ class ContentDefinedChunker { template void Roll(const T value); void Roll(std::string_view value); - inline bool Check(); + inline bool NeedNewChunk(); template const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 26a30a8bcb3..c7d15f4d5a7 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1362,7 +1362,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, bits_buffer_->ZeroPadding(); } - if (this->properties_->cdc_enabled()) { + if (properties_->cdc_enabled()) { ARROW_ASSIGN_OR_RAISE(auto boundaries, content_defined_chunker_.GetBoundaries( def_levels, rep_levels, num_levels, leaf_array)); diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 28228780ea9..e0b39666e1e 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2116,10 +2116,10 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.dictionary_pagesize_limit(dictionary_pagesize_limit) # content defined chunking - if cdc is False: - props.disable_cdc() - elif cdc is True: + if cdc: props.enable_cdc() + else: + props.disable_cdc() if cdc_size_range is not None: min_size, max_size = cdc_size_range diff --git a/python/run_test.sh b/python/run_test.sh new file mode 100755 index 00000000000..6476c12dcd4 --- /dev/null +++ b/python/run_test.sh @@ -0,0 +1,72 @@ +set -e + +# -DARROW_USE_ASAN=OFF \ +# -DARROW_USE_UBSAN=OFF \ +# -DARROW_USE_TSAN=OFF \ + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +ARROW_DIR=${SCRIPT_DIR}/.. 
+export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} +export ARROW_HOME=$CONDA_PREFIX +export PARQUET_TEST_DATA=${ARROW_DIR}/cpp/submodules/parquet-testing/data +export ARROW_TEST_DATA=${ARROW_DIR}/testing/data + +export ARROW_HDFS_TEST_HOST=impala +export ARROW_HDFS_TEST_PORT=8020 +export ARROW_HDFS_TEST_USER=hdfs + +mkdir -p ${ARROW_DIR}/cpp/build +pushd ${ARROW_DIR}/cpp/build + +cmake -GNinja \ + -DARROW_BUILD_BENCHMARKS=OFF \ + -DARROW_BUILD_STATIC=OFF \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_USE_ASAN=OFF \ + -DARROW_DATASET=ON \ + -DARROW_EXTRA_ERROR_CONTEXT=ON \ + -DARROW_BUILD_INTEGRATION=ON \ + -DARROW_DEPENDENCY_SOURCE=CONDA \ + -DARROW_FLIGHT=OFF \ + -DARROW_GANDIVA=OFF \ + -DARROW_JEMALLOC=ON \ + -DARROW_MIMALLOC=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_COMPUTE=ON \ + -DARROW_PARQUET=ON \ + -DARROW_CSV=ON \ + -DARROW_ORC=OFF \ + -DARROW_USE_CCACHE=ON \ + -DARROW_S3=ON \ + -DARROW_TEST_MEMCHECK=OFF \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + .. 
+ +ninja +ninja install + +popd + +export PYARROW_CMAKE_GENERATOR=Ninja +export PYARROW_BUILD_TYPE=debug +export PYARROW_WITH_PARQUET=1 +# export PYARROW_WITH_HDFS=1 +# export PYARROW_WITH_GANDIVA=0 +export PYARROW_WITH_DATASET=1 +# export PYARROW_WITH_FLIGHT=1 +export PYARROW_WITH_S3=1 +export PYARROW_PARALLEL=8 +# export PYARROW_WITH_ORC=1 + +# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib +# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib + +pushd ${ARROW_DIR}/python +#python setup.py build_ext --inplace +python setup.py develop +popd +# pytest -sv "$@" From 3a9266292dda0a7717ebbb28ec48f355763c2618 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 1 Mar 2025 22:19:16 +0100 Subject: [PATCH 023/102] rename GEAR_HASH_TABLE to GEARHASH_TABLE --- cpp/src/parquet/column_chunker.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index 68c522a17c5..b35ea431b57 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -26,7 +26,7 @@ namespace parquet { namespace internal { -constexpr uint64_t GEAR_HASH_TABLE[8][256] = { +constexpr uint64_t GEARHASH_TABLE[8][256] = { {// seed = 0 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, @@ -591,7 +591,7 @@ void ContentDefinedChunker::Roll(const T value) { } auto bytes = reinterpret_cast(&value); for (size_t i = 0; i < BYTE_WIDTH; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][bytes[i]]; + rolling_hash_ = (rolling_hash_ << 1) + GEARHASH_TABLE[nth_run_][bytes[i]]; has_matched_ = has_matched_ || ((rolling_hash_ 
& hash_mask_) == 0); } } @@ -605,7 +605,7 @@ void ContentDefinedChunker::Roll(std::string_view value) { } for (char c : value) { rolling_hash_ = - (rolling_hash_ << 1) + GEAR_HASH_TABLE[nth_run_][static_cast(c)]; + (rolling_hash_ << 1) + GEARHASH_TABLE[nth_run_][static_cast(c)]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } From 9208bd34dbc6c2dcec496b23ee95a7a534f08f07 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 3 Mar 2025 12:48:25 +0100 Subject: [PATCH 024/102] some docstrings about CDC --- cpp/src/parquet/column_chunker.h | 83 +++++++++++++++++++++++++- cpp/src/parquet/column_chunker_hash.py | 27 +++++++++ python/run_test.sh | 72 ---------------------- 3 files changed, 107 insertions(+), 75 deletions(-) create mode 100644 cpp/src/parquet/column_chunker_hash.py delete mode 100755 python/run_test.sh diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index f77fdc435fb..1921efc0492 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -23,11 +23,12 @@ #include "arrow/array.h" #include "parquet/level_conversion.h" -using arrow::internal::checked_cast; - namespace parquet { + namespace internal { +// Represents a chunk of data with level offsets and value offsets due to the +// record shredding for nested data. struct Chunk { int64_t level_offset; int64_t value_offset; @@ -39,12 +40,81 @@ struct Chunk { levels_to_write(levels_to_write) {} }; +/// CDC (Content-Defined Chunking) is a technique that divides data into variable-sized +/// chunks based on the content of the data itself, rather than using fixed-size +/// boundaries. 
+/// +/// For example, given this sequence of values in a column: +/// +/// File1: [1,2,3, 4,5,6, 7,8,9] +/// chunk1 chunk2 chunk3 +/// +/// Assume there is an inserted value between 3 and 4: +/// +/// File2: [1,2,3,0, 4,5,6, 7,8,9] +/// new-chunk chunk2 chunk3 +/// +/// The chunking process will adjust to maintain stable boundaries across data +/// modifications. Each chunk defines a new parquet data page which is contiguously +/// written out to the file. Since each page is compressed independently, the files' +/// contents would look like the following with unique page identifiers: +/// +/// File1: [Page1][Page2][Page3]... +/// File2: [Page4][Page2][Page3]... +/// +/// Then the parquet file is uploaded to a content addressable storage (CAS) system +/// which splits the byte stream into content defined blobs. The CAS system will calculate +/// a unique identifier for each blob, then store the blob in a key-value store. If the +/// same blob is encountered again, the system can refer to the hash instead of physically +/// storing the blob again. In the example above, the CAS system would physically store +/// Page1, Page2, Page3, and Page4 only once and the required metadata to reassemble the +/// files. +/// While the deduplication is performed by the CAS system, the parquet chunker makes it +/// possible to efficiently deduplicate the data by consistently dividing the data into +/// chunks. +/// +/// Implementation details: +/// +/// Only the parquet writer must be aware of the content defined chunking, the reader +/// doesn't need to know about it. Each parquet column writer holds a +/// ContentDefinedChunker instance depending on the writer's properties. The chunker's +/// state is maintained across the entire column without being reset between pages and row +/// groups.
+/// +/// The chunker receives the record shredded column data (def_levels, rep_levels, values) +/// and goes over the (def_level, rep_level, value) triplets one by one while adjusting +/// the column-global rolling hash based on the triplet. Whenever the rolling hash matches +/// a predefined mask, the chunker creates a new chunk. The chunker returns a vector of +/// Chunk objects that represent the boundaries of the chunks. +/// Note that the boundaries are deterministically calculated exclusively based on the +/// data itself, so the same data will always produce the same chunks - given the same +/// chunker configuration. +/// +/// References: +/// - FastCDC paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for +/// Data Deduplication" +/// https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf class ContentDefinedChunker { public: + /// Create a new ContentDefinedChunker instance + /// + /// @param level_info Information about definition and repetition levels + /// @param size_range Min/max chunk size as pair, the chunker will + /// attempt to uniformly distribute the chunks between these extremes. + /// @param norm_factor Normalization factor to center the chunk size around the average + /// size more aggressively. By increasing the normalization factor, + /// probability of finding a chunk boundary increases.
ContentDefinedChunker(const LevelInfo& level_info, std::pair size_range, - uint8_t norm_factor = 1); + uint8_t norm_factor = 0); + /// Get the chunk boundaries for the given column data + /// + /// @param def_levels Definition levels + /// @param rep_levels Repetition levels + /// @param num_levels Number of levels + /// @param values Column values as an Arrow array + /// @return Vector of Chunk objects representing the chunk boundaries const ::arrow::Result> GetBoundaries(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, @@ -60,9 +130,16 @@ class ContentDefinedChunker { const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const T& leaf_array); + // Reference to the column's level information const internal::LevelInfo& level_info_; + // Minimum chunk size in bytes, the rolling hash will not be updated until this size is + // reached for each chunk. Note that all data sent through the hash function is counted + // towards the chunk size, including definition and repetition levels. const uint64_t min_size_; const uint64_t max_size_; + // The mask to match the rolling hash against to determine if a new chunk should be + // created. The mask is calculated based on min/max chunk size and the normalization + // factor. 
const uint64_t hash_mask_; bool has_matched_ = false; diff --git a/cpp/src/parquet/column_chunker_hash.py b/cpp/src/parquet/column_chunker_hash.py new file mode 100644 index 00000000000..179918dd4fe --- /dev/null +++ b/cpp/src/parquet/column_chunker_hash.py @@ -0,0 +1,27 @@ +import hashlib +import sys + + +def gearhash(n: int, seed: int): + value = bytes([seed] * 64 + [n] * 64) + hasher = hashlib.md5(value) + return hasher.hexdigest()[:16] + + +def print_table(seed: int, length=256, comma=True): + table = [gearhash(n, seed=seed) for n in range(length)] + print(f"{{ // seed = {seed}") + for i in range(0, length, 4): + print(" ", end="") + values = [f"0x{value}" for value in table[i:i + 4]] + values = ", ".join(values) + print(f" {values}", end=",\n" if i < length - 4 else "\n") + print(" }", end=", " if comma else "") + + +if __name__ == "__main__": + print("{") + n = int(sys.argv[1]) + for seed in range(n): + print_table(seed, comma=seed < n) + print("}") \ No newline at end of file diff --git a/python/run_test.sh b/python/run_test.sh deleted file mode 100755 index 6476c12dcd4..00000000000 --- a/python/run_test.sh +++ /dev/null @@ -1,72 +0,0 @@ -set -e - -# -DARROW_USE_ASAN=OFF \ -# -DARROW_USE_UBSAN=OFF \ -# -DARROW_USE_TSAN=OFF \ - -SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) -ARROW_DIR=${SCRIPT_DIR}/.. 
-export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} -export ARROW_HOME=$CONDA_PREFIX -export PARQUET_TEST_DATA=${ARROW_DIR}/cpp/submodules/parquet-testing/data -export ARROW_TEST_DATA=${ARROW_DIR}/testing/data - -export ARROW_HDFS_TEST_HOST=impala -export ARROW_HDFS_TEST_PORT=8020 -export ARROW_HDFS_TEST_USER=hdfs - -mkdir -p ${ARROW_DIR}/cpp/build -pushd ${ARROW_DIR}/cpp/build - -cmake -GNinja \ - -DARROW_BUILD_BENCHMARKS=OFF \ - -DARROW_BUILD_STATIC=OFF \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_USE_ASAN=OFF \ - -DARROW_DATASET=ON \ - -DARROW_EXTRA_ERROR_CONTEXT=ON \ - -DARROW_BUILD_INTEGRATION=ON \ - -DARROW_DEPENDENCY_SOURCE=CONDA \ - -DARROW_FLIGHT=OFF \ - -DARROW_GANDIVA=OFF \ - -DARROW_JEMALLOC=ON \ - -DARROW_MIMALLOC=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_ZSTD=ON \ - -DARROW_COMPUTE=ON \ - -DARROW_PARQUET=ON \ - -DARROW_CSV=ON \ - -DARROW_ORC=OFF \ - -DARROW_USE_CCACHE=ON \ - -DARROW_S3=ON \ - -DARROW_TEST_MEMCHECK=OFF \ - -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - .. 
- -ninja -ninja install - -popd - -export PYARROW_CMAKE_GENERATOR=Ninja -export PYARROW_BUILD_TYPE=debug -export PYARROW_WITH_PARQUET=1 -# export PYARROW_WITH_HDFS=1 -# export PYARROW_WITH_GANDIVA=0 -export PYARROW_WITH_DATASET=1 -# export PYARROW_WITH_FLIGHT=1 -export PYARROW_WITH_S3=1 -export PYARROW_PARALLEL=8 -# export PYARROW_WITH_ORC=1 - -# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib -# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib - -pushd ${ARROW_DIR}/python -#python setup.py build_ext --inplace -python setup.py develop -popd -# pytest -sv "$@" From 123721622ca5c723290753f9bcb96eeda569437f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 3 Mar 2025 13:23:12 +0100 Subject: [PATCH 025/102] place the gearhash table to a separate header --- cpp/src/parquet/column_chunker.cc | 524 +------------------ cpp/src/parquet/column_chunker_hash.py | 27 - cpp/src/parquet/column_chunker_hashtable.h | 547 ++++++++++++++++++++ cpp/src/parquet/column_chunker_hashtable.py | 90 ++++ cpp/src/parquet/column_chunker_test.cc | 19 +- 5 files changed, 656 insertions(+), 551 deletions(-) delete mode 100644 cpp/src/parquet/column_chunker_hash.py create mode 100644 cpp/src/parquet/column_chunker_hashtable.h create mode 100644 cpp/src/parquet/column_chunker_hashtable.py diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index b35ea431b57..90979bbd25a 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -21,534 +21,12 @@ #include #include "arrow/array.h" #include "arrow/util/logging.h" +#include "parquet/column_chunker_hashtable.h" #include "parquet/level_conversion.h" namespace parquet { namespace internal { -constexpr uint64_t GEARHASH_TABLE[8][256] = 
{ - {// seed = 0 - 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, - 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, - 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, - 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, - 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, - 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, - 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, - 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, - 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, - 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, - 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, - 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, - 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, - 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, - 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, - 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, - 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, - 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, - 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, - 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, - 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, - 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, - 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, - 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, - 
0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, - 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, - 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, - 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, - 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, - 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, - 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, - 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, - 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, - 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, - 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, - 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, - 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, - 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, - 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, - 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, - 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, - 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, - 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, - 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, - 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, - 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, - 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, - 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, - 0x0956e6ee9d8eb60b, 
0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, - 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, - 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, - 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, - 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, - 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, - 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, - 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, - 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, - 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, - 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, - 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, - 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, - 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, - 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, - 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, - {// seed = 1 - 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, - 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, - 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, - 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, - 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, - 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, - 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, - 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, - 0x0122d28226102bba, 
0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, - 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, - 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, - 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, - 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, - 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, - 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, - 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, - 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, - 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, - 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, - 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, - 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, - 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, - 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, - 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, - 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, - 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, - 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, - 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, - 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, - 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, - 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, - 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, - 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 
0xd086acbdf15d9455, 0xa4d19e43f41ea87b, - 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, - 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, - 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, - 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, - 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, - 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, - 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, - 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, - 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, - 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, - 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, - 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, - 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, - 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, - 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, - 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, - 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, - 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, - 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, - 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, - 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, - 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, - 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, - 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 
0xd1dfb7a5cc3974be, - 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, - 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, - 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, - 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, - 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, - 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, - 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, - {// seed = 2 - 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, - 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, - 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, - 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, - 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, - 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, - 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, - 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, - 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, - 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, - 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, - 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, - 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, - 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, - 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, - 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, - 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 
0x84e6d7f678a0e2d0, - 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, - 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, - 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, - 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, - 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, - 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, - 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, - 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, - 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, - 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, - 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, - 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, - 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, - 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, - 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, - 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, - 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, - 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, - 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, - 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, - 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, - 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, - 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, - 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, - 
0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, - 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, - 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, - 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, - 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, - 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, - 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, - 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, - 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, - 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, - 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, - 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, - 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, - 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, - 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, - 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, - 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, - 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, - 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, - 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, - 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, - 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, - 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, - {// seed = 3 - 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, - 
0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, - 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, - 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, - 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, - 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, - 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, - 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, - 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, - 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, - 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, - 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, - 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, - 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, - 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, - 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, - 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, - 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, - 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, - 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, - 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, - 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, - 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, - 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, - 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, - 0x0b912d7eb712d9ee, 
0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, - 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, - 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, - 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, - 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, - 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, - 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, - 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, - 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, - 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, - 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, - 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, - 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, - 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, - 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, - 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, - 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, - 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, - 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, - 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, - 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, - 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, - 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, - 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, - 0x794876dbca3e471f, 0x951937d8682670ce, 
0x0f99cb1f52ed466a, 0x8c7cd205543b804c, - 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, - 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, - 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, - 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, - 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, - 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, - 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, - 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, - 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, - 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, - 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, - 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, - 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, - 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, - {// seed = 4 - 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, - 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, - 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, - 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, - 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, - 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, - 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, - 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, - 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, - 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 
0x856de173586a7b34, 0xcedb291b594cb1b5, - 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, - 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, - 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, - 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, - 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, - 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, - 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, - 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, - 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, - 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, - 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, - 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, - 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, - 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, - 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, - 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, - 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, - 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, - 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, - 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, - 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, - 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, - 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, - 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 
0xcee01d52cdf88833, - 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, - 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, - 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, - 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, - 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, - 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, - 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, - 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, - 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, - 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, - 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, - 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, - 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, - 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, - 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, - 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, - 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, - 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, - 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, - 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, - 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, - 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, - 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, - 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, - 
0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, - 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, - 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, - 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, - 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, - 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, - {// seed = 5 - 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, - 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, - 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, - 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, - 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, - 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, - 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, - 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, - 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, - 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, - 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, - 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, - 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, - 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, - 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, - 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, - 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, - 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, - 
0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, - 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, - 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, - 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, - 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, - 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, - 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, - 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, - 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, - 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, - 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, - 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, - 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, - 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, - 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, - 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, - 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, - 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, - 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, - 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, - 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, - 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, - 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, - 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, - 0x00914a916b07b389, 
0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, - 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, - 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, - 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, - 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, - 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, - 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, - 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, - 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, - 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, - 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, - 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, - 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, - 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, - 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, - 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, - 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, - 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, - 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, - 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, - 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, - 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, - {// seed = 6 - 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, - 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, - 0x7f64017f2b7b3738, 
0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, - 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, - 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, - 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, - 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, - 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, - 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, - 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, - 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, - 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, - 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, - 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, - 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, - 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, - 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, - 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, - 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, - 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, - 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, - 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, - 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, - 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, - 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, - 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, - 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 
0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, - 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, - 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, - 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, - 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, - 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, - 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, - 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, - 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, - 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, - 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, - 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, - 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, - 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, - 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, - 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, - 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, - 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, - 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, - 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, - 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, - 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, - 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, - 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, - 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 
0x0be1945f02a78bd5, - 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, - 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, - 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, - 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, - 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, - 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, - 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, - 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, - 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, - 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, - 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, - 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, - 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, - {// seed = 7 - 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, - 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, - 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, - 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, - 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, - 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, - 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, - 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, - 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, - 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, - 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 
0x4192b0446c06d1e6, - 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, - 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, - 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, - 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, - 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, - 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, - 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, - 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, - 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, - 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, - 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, - 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, - 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, - 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, - 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, - 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, - 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, - 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, - 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, - 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, - 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, - 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, - 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, - 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, - 
0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, - 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, - 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, - 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, - 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, - 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, - 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, - 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, - 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, - 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, - 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, - 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, - 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, - 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, - 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, - 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, - 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, - 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, - 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, - 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, - 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, - 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, - 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, - 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, - 0x0e354f0b3bda49df, 
0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, - 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, - 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, - 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, - 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}, -}; - // create a fake null array class with a GetView method returning 0 always class FakeNullArray { public: diff --git a/cpp/src/parquet/column_chunker_hash.py b/cpp/src/parquet/column_chunker_hash.py deleted file mode 100644 index 179918dd4fe..00000000000 --- a/cpp/src/parquet/column_chunker_hash.py +++ /dev/null @@ -1,27 +0,0 @@ -import hashlib -import sys - - -def gearhash(n: int, seed: int): - value = bytes([seed] * 64 + [n] * 64) - hasher = hashlib.md5(value) - return hasher.hexdigest()[:16] - - -def print_table(seed: int, length=256, comma=True): - table = [gearhash(n, seed=seed) for n in range(length)] - print(f"{{ // seed = {seed}") - for i in range(0, length, 4): - print(" ", end="") - values = [f"0x{value}" for value in table[i:i + 4]] - values = ", ".join(values) - print(f" {values}", end=",\n" if i < length - 4 else "\n") - print(" }", end=", " if comma else "") - - -if __name__ == "__main__": - print("{") - n = int(sys.argv[1]) - for seed in range(n): - print_table(seed, comma=seed < n) - print("}") \ No newline at end of file diff --git a/cpp/src/parquet/column_chunker_hashtable.h b/cpp/src/parquet/column_chunker_hashtable.h new file mode 100644 index 00000000000..b608e658385 --- /dev/null +++ b/cpp/src/parquet/column_chunker_hashtable.h @@ -0,0 +1,547 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once +#include + +namespace parquet { +namespace internal { + +constexpr uint64_t GEARHASH_TABLE[8][256] = { + {// seed = 0 + 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, + 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, + 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, + 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, + 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, + 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, + 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, + 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, + 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, + 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, + 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, + 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, + 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, + 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, + 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, + 0xbedb142568209556, 0x5360d81c25429dce, 
0x63a69a960a952f37, 0xc900d63899e1b503, + 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, + 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, + 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, + 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, + 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, + 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, + 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 0x1adce054289a0ec6, + 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, + 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, + 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, + 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, + 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, + 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, + 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, + 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, + 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, + 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, + 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, + 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, + 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, + 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, + 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, + 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, + 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 
0x7323b98965eeb46b, + 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, + 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, + 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, + 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, + 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, + 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, + 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, + 0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, + 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, + 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, + 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, + 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, + 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, + 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, + 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, + 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, + 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, + 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, + 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, + 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, + 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, + 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, + 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, + 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, + {// seed 
= 1 + 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, + 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, + 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, + 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, + 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, + 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, + 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, + 0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, + 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, + 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, + 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, + 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, + 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, + 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, + 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, + 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, + 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, + 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, + 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, + 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, + 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, + 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, + 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, + 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, + 0xd6f7ab944b86c3fa, 
0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, + 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, + 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, + 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, + 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, + 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, + 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, + 0xa43b3763e7d4c902, 0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, + 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, + 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, + 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, + 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, + 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, + 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, + 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, + 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, + 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, + 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, + 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, + 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, + 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, + 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, + 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, + 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, + 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 
0x7baba71b8920d55f, 0x10edc0216105bc96, + 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, + 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, + 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, + 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, + 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, + 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, + 0xeef28ced23894cc2, 0x753fdd3352aefd68, 0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, + 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, + 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, + 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, + 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, + 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, + 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, + 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, + 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, + {// seed = 2 + 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, + 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, + 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, + 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, + 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, + 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, + 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, + 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, + 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 
0xf7a74af0ff1aea88, 0x6f8873274a987162, + 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, + 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, + 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, + 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, + 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, + 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, + 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 0xc8f484d1dbc38427, 0xae9c510c463574c0, + 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, + 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, + 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, + 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, + 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, + 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, + 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, + 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, + 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, + 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, + 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, + 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, + 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, + 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, + 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, + 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, + 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 
0xb7e53b090ddb5d25, + 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, + 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, + 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, + 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, + 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, + 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, + 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 0x9b2f7bb03d836a79, + 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, + 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, + 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, + 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, + 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, + 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, + 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, + 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, + 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, + 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, + 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, + 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, + 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, + 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, + 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, + 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, + 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, + 
0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, + 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, + 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, + 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, + 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, + 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, + 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, + {// seed = 3 + 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, + 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, + 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, + 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, + 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, + 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, + 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, + 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, + 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, + 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, + 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, + 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, + 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, + 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, + 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, + 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, + 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, + 
0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, + 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, + 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, + 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, + 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, + 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, + 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, + 0x0ce2db5e8f9553d6, 0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, + 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, + 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, + 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, + 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, + 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, + 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, + 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, + 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, + 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, + 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, + 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, + 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, + 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, + 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, + 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, + 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, + 0xf96c279e7927b116, 
0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, + 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, + 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, + 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, + 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, + 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, + 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, + 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 0x5e6d66141e8d632a, 0x7638285124d5d602, + 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, + 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, + 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, + 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, + 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, + 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, + 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, + 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, + 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, + 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, + 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, + 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, + 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, + 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, + 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, + {// seed = 4 + 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, + 0x0d6b1ea7780f1352, 
0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, + 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, + 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, + 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, + 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, + 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, + 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, + 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, + 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, + 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, + 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, + 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, + 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, + 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, + 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, + 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, + 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, + 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, + 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, + 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, + 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, + 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, + 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, + 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, + 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 
0x7bf4338b7c387203, 0x34ff25f00cd06185, + 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, + 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, + 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, + 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, + 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, + 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, + 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 0xb7862594da5336fc, + 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, + 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, + 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, + 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, + 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, + 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, + 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, + 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, + 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, + 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, + 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, + 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, + 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, + 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, + 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, + 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, + 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 
0xe0ad82b92d5ecb2c, + 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, + 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, + 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, + 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, + 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, + 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, + 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, + 0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, + 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, + 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, + 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, + 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, + 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, + 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, + {// seed = 5 + 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, + 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, + 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, + 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, + 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, + 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, + 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, + 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, + 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, + 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 
0xa6dbdb8aa25f0fd4, + 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, + 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, + 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, + 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, + 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, + 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, + 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, + 0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, + 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, + 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, + 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, + 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, + 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, + 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, + 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, + 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, + 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, + 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, + 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, + 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, + 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, + 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, + 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, + 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, + 
0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, + 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, + 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, + 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, + 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, + 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, + 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, + 0x24ca9c8092105ad5, 0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, + 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, + 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, + 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, + 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, + 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, + 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, + 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, + 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, + 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, + 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, + 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, + 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, + 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, + 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, + 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, + 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, + 0xd68130437294dbcc, 
0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, + 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, + 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, + 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, + 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, + 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, + {// seed = 6 + 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, + 0x983fde47b3c3847e, 0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, + 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, + 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, + 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, + 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, + 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, + 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, + 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, + 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, + 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, + 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, + 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, + 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, + 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, + 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, + 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, + 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, + 0xda6e8df4214a8b04, 
0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, + 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, + 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, + 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, + 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, + 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, + 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, + 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 0x05602be7df625702, 0x11f1ac73790f7a9f, + 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, + 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, + 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, + 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, + 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, + 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, + 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, + 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, + 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, + 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, + 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, + 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, + 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, + 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, + 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, + 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, + 0xa7078b96955631bd, 0xdd6b3543aa187f33, 
0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, + 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, + 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, + 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, + 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, + 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, + 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, + 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 0x7094579d1000ec84, + 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, + 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, + 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, + 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, + 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, + 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, + 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, + 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, + 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, + 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, + 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, + 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, + 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, + 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, + {// seed = 7 + 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, + 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, + 0x481b6bf58037bd83, 0x4933ba8647728d22, 
0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, + 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, + 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, + 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, + 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, + 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, + 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, + 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 0x8ed27abfa8cfcb05, + 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, + 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, + 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, + 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, + 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, + 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, + 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, + 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, + 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, + 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, + 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, + 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, + 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, + 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, + 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, + 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, + 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 
0x1f8d142208094223, + 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, + 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, + 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, + 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, + 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, + 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, + 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, + 0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, + 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, + 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, + 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, + 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, + 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, + 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, + 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, + 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, + 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, + 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, + 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, + 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, + 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, + 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, + 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, + 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, + 
0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, + 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, + 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, + 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, + 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, + 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, + 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, + 0x9c4386cf0a07f3b2, 0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, + 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, + 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, + 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, + 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, + 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}}; + +} // namespace internal +} // namespace parquet diff --git a/cpp/src/parquet/column_chunker_hashtable.py b/cpp/src/parquet/column_chunker_hashtable.py new file mode 100644 index 00000000000..8addcc3af26 --- /dev/null +++ b/cpp/src/parquet/column_chunker_hashtable.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import hashlib +import pathlib +import sys +from io import StringIO + + +template = """\ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once +#include + +namespace parquet {{ +namespace internal {{ + +constexpr uint64_t GEARHASH_TABLE[8][256] = {{ +{content}}}; + +}} // namespace internal +}} // namespace parquet +""" + + +def generate_hash(n: int, seed: int): + value = bytes([seed] * 64 + [n] * 64) + hasher = hashlib.md5(value) + return hasher.hexdigest()[:16] + + +def generate_hashtable(seed: int, length=256, comma=True): + table = [generate_hash(n, seed=seed) for n in range(length)] + + out = StringIO() + out.write(f" {{// seed = {seed}\n") + for i in range(0, length, 4): + values = [f"0x{value}" for value in table[i:i + 4]] + values = ", ".join(values) + out.write(f" {values}") + if i < length - 4: + out.write(",\n") + out.write("}") + + return out.getvalue() + + +def generate_header(ntables=8, relative_path="column_chunker_hashtable.h"): + path = pathlib.Path(__file__).parent / relative_path + + tables = [generate_hashtable(seed) for seed in range(ntables)] + text = template.format(content=",\n".join(tables)) + path.write_text(text) + + +if __name__ == "__main__": + ntables = int(sys.argv[1]) if len(sys.argv) > 1 else 8 + generate_header(ntables) \ No newline at end of file diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index f7b1e8fe1c7..99301179ebd 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -846,6 +846,24 @@ TEST_P(TestColumnCDC, Append) { } } +TEST_P(TestColumnCDC, EmptyTable) { + auto [dtype, nullable, _] = GetParam(); + + auto schema = ::arrow::schema({::arrow::field("f0", dtype, nullable)}); + ASSERT_OK_AND_ASSIGN(auto empty_table, GenerateTable(schema, 0, 0)); + ASSERT_EQ(empty_table->num_rows(), 0); + + for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto result, + WriteAndGetPageSizes(empty_table, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + // An empty table should result in no data pages + 
ASSERT_TRUE(result.lengths.empty()); + ASSERT_TRUE(result.sizes.empty()); + } +} + // TODO(kszucs): add extension type and dictionary type INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, @@ -888,4 +906,3 @@ INSTANTIATE_TEST_SUITE_P( // TODO: // - test multiple row groups -// - test empty From 5d187d520530600750557a0dc05b13d29677e98a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 3 Mar 2025 14:26:45 +0100 Subject: [PATCH 026/102] more CDC docstrings --- cpp/src/parquet/column_chunker.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index 1921efc0492..c9a5c4e62e8 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -121,11 +121,20 @@ class ContentDefinedChunker { const ::arrow::Array& values); private: + // Update the rolling hash with a compile-time known sized value, set has_matched_ to + // true if the hash matches the mask. template void Roll(const T value); + + // Update the rolling hash with a binary-like value, set has_matched_ to true if the + // hash matches the mask. void Roll(std::string_view value); + + // Evaluate whether a new chunk should be created based on the has_matched_, nth_run_ + // and chunk_size_ state. inline bool NeedNewChunk(); + // Calculate the chunk boundaries for typed Arrow arrays. template const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const T& leaf_array); @@ -142,9 +151,16 @@ class ContentDefinedChunker { // factor. const uint64_t hash_mask_; + // Whether the rolling hash has matched the mask since the last chunk creation. This + // flag is set true by the Roll() function when the mask is matched and reset to false + // by NeedNewChunk() method. bool has_matched_ = false; + // The current run of the rolling hash, used to normalize the chunk size distribution + // by requiring multiple consecutive matches to create a new chunk. 
uint64_t nth_run_ = 0; + // Current chunk size in bytes, reset to 0 when a new chunk is created. uint64_t chunk_size_ = 0; + // Rolling hash state, never reset only initialized once for the entire column. uint64_t rolling_hash_ = 0; }; From 8b8722d86e2abf5984ef145a83940b99a95ae32f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Mar 2025 11:48:34 +0100 Subject: [PATCH 027/102] address review comments --- cpp/src/parquet/column_chunker.cc | 6 ++---- cpp/src/parquet/column_chunker.h | 9 +++------ cpp/src/parquet/column_chunker_hashtable.h | 6 ++---- cpp/src/parquet/column_writer.h | 1 - 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/column_chunker.cc index 90979bbd25a..11db84dbe54 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/column_chunker.cc @@ -24,8 +24,7 @@ #include "parquet/column_chunker_hashtable.h" #include "parquet/level_conversion.h" -namespace parquet { -namespace internal { +namespace parquet::internal { // create a fake null array class with a GetView method returning 0 always class FakeNullArray { @@ -247,5 +246,4 @@ const ::arrow::Result> ContentDefinedChunker::GetBoundaries( } } -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/column_chunker.h index c9a5c4e62e8..30b04df6d7c 100644 --- a/cpp/src/parquet/column_chunker.h +++ b/cpp/src/parquet/column_chunker.h @@ -23,9 +23,7 @@ #include "arrow/array.h" #include "parquet/level_conversion.h" -namespace parquet { - -namespace internal { +namespace parquet::internal { // Represents a chunk of data with level offsets and value offsets due to the // record shredding for nested data. @@ -85,7 +83,7 @@ struct Chunk { /// and goes over the (def_level, rep_level, value) triplets one by one while adjusting /// the column-global rolling hash based on the triplet. 
Whenever the rolling hash matches /// a predefined mask, the chunker creates a new chunk. The chunker returns a vector of -/// Chunk objects that represent the boundaries of the chunks/// +/// Chunk objects that represent the boundaries of the chunks. /// Note that the boundaries are deterministically calculated exclusively based on the /// data itself, so the same data will always produce the same chunks - given the same /// chunker configuration. @@ -164,5 +162,4 @@ class ContentDefinedChunker { uint64_t rolling_hash_ = 0; }; -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/column_chunker_hashtable.h b/cpp/src/parquet/column_chunker_hashtable.h index b608e658385..63812cfec84 100644 --- a/cpp/src/parquet/column_chunker_hashtable.h +++ b/cpp/src/parquet/column_chunker_hashtable.h @@ -18,8 +18,7 @@ #pragma once #include -namespace parquet { -namespace internal { +namespace parquet::internal { constexpr uint64_t GEARHASH_TABLE[8][256] = { {// seed = 0 @@ -543,5 +542,4 @@ constexpr uint64_t GEARHASH_TABLE[8][256] = { 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}}; -} // namespace internal -} // namespace parquet +} // namespace parquet::internal diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 2ef549150b3..bd329d61053 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -23,7 +23,6 @@ #include "arrow/type_fwd.h" #include "arrow/util/compression.h" -#include "parquet/column_chunker.h" #include "parquet/exception.h" #include "parquet/platform.h" #include "parquet/types.h" From 6d63050fa6d81a16ca045a021ea82558bd088610 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Mar 2025 13:05:44 +0100 Subject: [PATCH 028/102] rename files to chunker_internal_* to avoid installing the headers --- cpp/src/parquet/CMakeLists.txt | 5 
+++-- cpp/src/parquet/{column_chunker.cc => chunker_internal.cc} | 5 +++-- cpp/src/parquet/{column_chunker.h => chunker_internal.h} | 0 ...lumn_chunker_hashtable.h => chunker_internal_hashtable.h} | 0 ...mn_chunker_hashtable.py => chunker_internal_hashtable.py} | 0 .../{column_chunker_test.cc => chunker_internal_test.cc} | 0 cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/properties.h | 1 - 8 files changed, 7 insertions(+), 6 deletions(-) rename cpp/src/parquet/{column_chunker.cc => chunker_internal.cc} (99%) rename cpp/src/parquet/{column_chunker.h => chunker_internal.h} (100%) rename cpp/src/parquet/{column_chunker_hashtable.h => chunker_internal_hashtable.h} (100%) rename cpp/src/parquet/{column_chunker_hashtable.py => chunker_internal_hashtable.py} (100%) rename cpp/src/parquet/{column_chunker_test.cc => chunker_internal_test.cc} (100%) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 31dbcdd7cc3..a8d92f190c3 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -160,7 +160,7 @@ set(PARQUET_SRCS arrow/writer.cc bloom_filter.cc bloom_filter_reader.cc - column_chunker.cc + chunker_internal.cc column_reader.cc column_scanner.cc column_writer.cc @@ -396,11 +396,12 @@ add_parquet_test(reader-test add_parquet_test(writer-test SOURCES - column_chunker_test.cc column_writer_test.cc file_serialize_test.cc stream_writer_test.cc) +add_parquet_test(chunker-test SOURCES chunker_internal_test.cc) + add_parquet_test(arrow-test SOURCES arrow/arrow_metadata_test.cc diff --git a/cpp/src/parquet/column_chunker.cc b/cpp/src/parquet/chunker_internal.cc similarity index 99% rename from cpp/src/parquet/column_chunker.cc rename to cpp/src/parquet/chunker_internal.cc index 11db84dbe54..1c463bde5f6 100644 --- a/cpp/src/parquet/column_chunker.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-#include "parquet/column_chunker.h" +#include "parquet/chunker_internal.h" + #include #include #include #include "arrow/array.h" #include "arrow/util/logging.h" -#include "parquet/column_chunker_hashtable.h" +#include "parquet/chunker_internal_hashtable.h" #include "parquet/level_conversion.h" namespace parquet::internal { diff --git a/cpp/src/parquet/column_chunker.h b/cpp/src/parquet/chunker_internal.h similarity index 100% rename from cpp/src/parquet/column_chunker.h rename to cpp/src/parquet/chunker_internal.h diff --git a/cpp/src/parquet/column_chunker_hashtable.h b/cpp/src/parquet/chunker_internal_hashtable.h similarity index 100% rename from cpp/src/parquet/column_chunker_hashtable.h rename to cpp/src/parquet/chunker_internal_hashtable.h diff --git a/cpp/src/parquet/column_chunker_hashtable.py b/cpp/src/parquet/chunker_internal_hashtable.py similarity index 100% rename from cpp/src/parquet/column_chunker_hashtable.py rename to cpp/src/parquet/chunker_internal_hashtable.py diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/chunker_internal_test.cc similarity index 100% rename from cpp/src/parquet/column_chunker_test.cc rename to cpp/src/parquet/chunker_internal_test.cc diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index c7d15f4d5a7..a0bb2bad669 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -45,7 +45,7 @@ #include "arrow/util/rle_encoding_internal.h" #include "arrow/util/type_traits.h" #include "arrow/visit_array_inline.h" -#include "parquet/column_chunker.h" +#include "parquet/chunker_internal.h" #include "parquet/column_page.h" #include "parquet/encoding.h" #include "parquet/encryption/encryption_internal.h" diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a980ab29d80..a730348e3dc 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -27,7 +27,6 @@ #include "arrow/type.h" #include "arrow/util/compression.h" 
#include "arrow/util/type_fwd.h" -#include "parquet/column_chunker.h" #include "parquet/encryption/encryption.h" #include "parquet/exception.h" #include "parquet/parquet_version.h" From 614f5df4dda6f331e98acac64ebf63a11cdaef7d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Mar 2025 13:13:07 +0100 Subject: [PATCH 029/102] prefer to throw parquet exception rather than returning arrow status --- cpp/src/parquet/chunker_internal.cc | 8 +++++--- cpp/src/parquet/chunker_internal.h | 17 ++++++++--------- cpp/src/parquet/column_writer.cc | 5 ++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 1c463bde5f6..e04d77c4234 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -23,6 +23,7 @@ #include "arrow/array.h" #include "arrow/util/logging.h" #include "parquet/chunker_internal_hashtable.h" +#include "parquet/exception.h" #include "parquet/level_conversion.h" namespace parquet::internal { @@ -204,7 +205,7 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev return Calculate(def_levels, rep_levels, num_levels, \ static_cast(values)); -const ::arrow::Result> ContentDefinedChunker::GetBoundaries( +const std::vector ContentDefinedChunker::GetBoundaries( const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { auto type_id = values.type()->id(); @@ -242,8 +243,9 @@ const ::arrow::Result> ContentDefinedChunker::GetBoundaries( FakeNullArray fake_null_array; return Calculate(def_levels, rep_levels, num_levels, fake_null_array); default: - return ::arrow::Status::NotImplemented("Unsupported type " + - values.type()->ToString()); + throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); + // return ::arrow::Status::NotImplemented("Unsupported type " + + // values.type()->ToString()); } } diff --git a/cpp/src/parquet/chunker_internal.h 
b/cpp/src/parquet/chunker_internal.h index 30b04df6d7c..02414cf651d 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -60,11 +60,11 @@ struct Chunk { /// File1: [Page1][Page2][Page3]... /// File2: [Page4][Page2][Page3]... /// -/// Then the parquet file is being uploaded to a content addressable storage systems (CAS) -/// which split the bytes stream into content defined blobs. The CAS system will calculate -/// a unique identifier for each blob, then store the blob in a key-value store. If the -/// same blob is encountered again, the system can refer to the hash instead of physically -/// storing the blob again. In the example above, the CAS system would phiysically store +/// Then the parquet file is being uploaded to a content addressable storage system (CAS) +/// which splits the bytes stream into content defined blobs. The CAS system will +/// calculate a unique identifier for each blob, then store the blob in a key-value store. +/// If the same blob is encountered again, the system can refer to the hash instead of +/// physically storing the blob again. In the example above, the CAS system would store /// Page1, Page2, Page3, and Page4 only once and the required metadata to reassemble the /// files. 
/// While the deduplication is performed by the CAS system, the parquet chunker makes it @@ -113,10 +113,9 @@ class ContentDefinedChunker { /// @param num_levels Number of levels /// @param values Column values as an Arrow array /// @return Vector of Chunk objects representing the chunk boundaries - const ::arrow::Result> GetBoundaries(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& values); + const std::vector GetBoundaries(const int16_t* def_levels, + const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& values); private: // Update the rolling hash with a compile-time known sized value, set has_matched_ to diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a0bb2bad669..fa58a760d9a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1363,9 +1363,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } if (properties_->cdc_enabled()) { - ARROW_ASSIGN_OR_RAISE(auto boundaries, - content_defined_chunker_.GetBoundaries( - def_levels, rep_levels, num_levels, leaf_array)); + auto boundaries = content_defined_chunker_.GetBoundaries(def_levels, rep_levels, + num_levels, leaf_array); for (auto chunk : boundaries) { auto chunk_array = leaf_array.Slice(chunk.value_offset); auto chunk_def_levels = AddIfNotNull(def_levels, chunk.level_offset); From a2c15b09f9bad8f73f602d3806b721086d5148dc Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Mar 2025 13:32:11 +0100 Subject: [PATCH 030/102] add reference to chunk size normalization --- cpp/src/parquet/chunker_internal.cc | 2 -- cpp/src/parquet/chunker_internal.h | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index e04d77c4234..60b797c8146 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -244,8 +244,6 @@ const std::vector 
ContentDefinedChunker::GetBoundaries( return Calculate(def_levels, rep_levels, num_levels, fake_null_array); default: throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); - // return ::arrow::Status::NotImplemented("Unsupported type " + - // values.type()->ToString()); } } diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 02414cf651d..53d8df0a799 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -89,9 +89,11 @@ struct Chunk { /// chunker configuration. /// /// References: -/// - FastCDC paper: "FastCDC: a Fast and Efficient Content-Defined Chunking Approach for -/// Data Deduplication" +/// - FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data +/// Deduplication /// https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf +/// - Git is for Data (chunk size normalization used here is described in section 6.2.1): +/// https://www.cidrdb.org/cidr2023/papers/p43-low.pdf class ContentDefinedChunker { public: /// Create a new ContentDefinedChunker instance From dd21d23019885ed84e69ebe8248270fae7ec5ced Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 5 Mar 2025 13:44:13 +0100 Subject: [PATCH 031/102] add a comment about AddDataPage() at the end of each chunk --- cpp/src/parquet/column_writer.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fa58a760d9a..280704c4247 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1379,6 +1379,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, maybe_parent_nulls)); } if (num_buffered_values_ > 0) { + // Explicitly add a new data page according to the content-defined chunk + // boundaries. This way the same chunks will have the same byte-sequence + // in the resulting file, which can be identified by content addressible + // storage. 
AddDataPage(); } } From 5154d01f9f3f8f40fa5c1d420be8f8c48c076c5d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Mar 2025 19:51:02 +0100 Subject: [PATCH 032/102] address review comments --- .gitattributes | 1 + cpp/src/parquet/chunker_internal.cc | 20 +- cpp/src/parquet/chunker_internal.h | 3 +- ...shtable.py => chunker_internal_codegen.py} | 12 +- ...shtable.h => chunker_internal_generated.h} | 2 +- cpp/src/parquet/column_chunker_generated.h | 545 ++++++++++++++++++ cpp/src/parquet/column_writer.cc | 3 +- 7 files changed, 565 insertions(+), 21 deletions(-) rename cpp/src/parquet/{chunker_internal_hashtable.py => chunker_internal_codegen.py} (90%) rename cpp/src/parquet/{chunker_internal_hashtable.h => chunker_internal_generated.h} (99%) create mode 100644 cpp/src/parquet/column_chunker_generated.h diff --git a/.gitattributes b/.gitattributes index 70007c26c8b..18396af4933 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ cpp/src/arrow/util/bpacking_*_generated.h linguist-generated=true +cpp/src/parquet/chunker_*_generated.h linguist-generated=true cpp/src/generated/*.cpp linguist-generated=true cpp/src/generated/*.h linguist-generated=true go/**/*.s linguist-generated=true diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 60b797c8146..2ded58c7bdd 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -22,7 +22,7 @@ #include #include "arrow/array.h" #include "arrow/util/logging.h" -#include "parquet/chunker_internal_hashtable.h" +#include "parquet/chunker_internal_generated.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" @@ -52,12 +52,12 @@ static uint64_t GetMask(uint64_t min_size, uint64_t max_size, uint8_t norm_facto } ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, - std::pair size_range, + uint64_t min_size, uint64_t max_size, uint8_t norm_factor) : level_info_(level_info), - min_size_(size_range.first), 
- max_size_(size_range.second), - hash_mask_(GetMask(size_range.first, size_range.second, norm_factor)) {} + min_size_(min_size), + max_size_(max_size), + hash_mask_(GetMask(min_size, max_size, norm_factor)) {} template void ContentDefinedChunker::Roll(const T value) { @@ -70,7 +70,7 @@ void ContentDefinedChunker::Roll(const T value) { } auto bytes = reinterpret_cast(&value); for (size_t i = 0; i < BYTE_WIDTH; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + GEARHASH_TABLE[nth_run_][bytes[i]]; + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][bytes[i]]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } @@ -84,7 +84,7 @@ void ContentDefinedChunker::Roll(std::string_view value) { } for (char c : value) { rolling_hash_ = - (rolling_hash_ << 1) + GEARHASH_TABLE[nth_run_][static_cast(c)]; + (rolling_hash_ << 1) + kGearhashTable[nth_run_][static_cast(c)]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } @@ -105,9 +105,9 @@ bool ContentDefinedChunker::NeedNewChunk() { } } if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { - // we have a hard limit on the maximum chunk size, not that we don't reset the rolling - // hash state here, so the next NeedNewChunk() call will continue from the current - // state + // we have a hard limit on the maximum chunk size, note that we don't reset the + // rolling hash state here, so the next NeedNewChunk() call will continue from the + // current state chunk_size_ = 0; return true; } diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 53d8df0a799..b7334d3f6a5 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -104,8 +104,7 @@ class ContentDefinedChunker { /// @param norm_factor Normalization factor to center the chunk size around the average /// size more aggressively. By increasing the normalization factor, /// probability of finding a chunk boundary increases. 
- ContentDefinedChunker(const LevelInfo& level_info, - std::pair size_range, + ContentDefinedChunker(const LevelInfo& level_info, uint64_t min_size, uint64_t max_size, uint8_t norm_factor = 0); /// Get the chunk boundaries for the given column data diff --git a/cpp/src/parquet/chunker_internal_hashtable.py b/cpp/src/parquet/chunker_internal_codegen.py similarity index 90% rename from cpp/src/parquet/chunker_internal_hashtable.py rename to cpp/src/parquet/chunker_internal_codegen.py index 8addcc3af26..063eda4b92f 100644 --- a/cpp/src/parquet/chunker_internal_hashtable.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -44,14 +44,12 @@ #pragma once #include -namespace parquet {{ -namespace internal {{ +namespace parquet::internal {{ -constexpr uint64_t GEARHASH_TABLE[8][256] = {{ +constexpr uint64_t kGearhashTable[8][256] = {{ {content}}}; -}} // namespace internal -}} // namespace parquet +}} // namespace parquet::internal """ @@ -61,7 +59,7 @@ def generate_hash(n: int, seed: int): return hasher.hexdigest()[:16] -def generate_hashtable(seed: int, length=256, comma=True): +def generate_hashtable(seed: int, length=256): table = [generate_hash(n, seed=seed) for n in range(length)] out = StringIO() @@ -77,7 +75,7 @@ def generate_hashtable(seed: int, length=256, comma=True): return out.getvalue() -def generate_header(ntables=8, relative_path="column_chunker_hashtable.h"): +def generate_header(ntables=8, relative_path="column_chunker_generated.h"): path = pathlib.Path(__file__).parent / relative_path tables = [generate_hashtable(seed) for seed in range(ntables)] diff --git a/cpp/src/parquet/chunker_internal_hashtable.h b/cpp/src/parquet/chunker_internal_generated.h similarity index 99% rename from cpp/src/parquet/chunker_internal_hashtable.h rename to cpp/src/parquet/chunker_internal_generated.h index 63812cfec84..13a47984b74 100644 --- a/cpp/src/parquet/chunker_internal_hashtable.h +++ b/cpp/src/parquet/chunker_internal_generated.h @@ -20,7 +20,7 @@ namespace 
parquet::internal { -constexpr uint64_t GEARHASH_TABLE[8][256] = { +constexpr uint64_t kGearhashTable[8][256] = { {// seed = 0 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, diff --git a/cpp/src/parquet/column_chunker_generated.h b/cpp/src/parquet/column_chunker_generated.h new file mode 100644 index 00000000000..13a47984b74 --- /dev/null +++ b/cpp/src/parquet/column_chunker_generated.h @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once +#include + +namespace parquet::internal { + +constexpr uint64_t kGearhashTable[8][256] = { + {// seed = 0 + 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, + 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, + 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, + 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, + 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, + 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, + 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, + 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, + 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, + 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, + 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, + 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, + 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, + 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, + 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, + 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, + 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, + 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, + 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, + 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, + 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, + 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, + 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 
0x1adce054289a0ec6, + 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, + 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, + 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, + 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, + 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, + 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, + 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, + 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, + 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, + 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, + 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, + 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, + 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, + 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, + 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, + 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, + 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, + 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, + 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, + 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, + 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, + 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, + 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, + 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, + 
0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, + 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, + 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, + 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, + 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, + 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, + 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, + 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, + 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, + 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, + 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, + 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, + 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, + 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, + 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, + 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, + 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, + {// seed = 1 + 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, + 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, + 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, + 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, + 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, + 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, + 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, + 
0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, + 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, + 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, + 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, + 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, + 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, + 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, + 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, + 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, + 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, + 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, + 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, + 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, + 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, + 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, + 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, + 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, + 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, + 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, + 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, + 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, + 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, + 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, + 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, + 0xa43b3763e7d4c902, 
0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, + 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, + 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, + 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, + 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, + 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, + 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, + 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, + 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, + 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, + 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, + 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, + 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, + 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, + 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, + 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, + 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, + 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, + 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, + 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, + 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, + 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, + 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, + 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, + 0xeef28ced23894cc2, 0x753fdd3352aefd68, 
0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, + 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, + 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, + 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, + 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, + 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, + 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, + 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, + 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, + {// seed = 2 + 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, + 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, + 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, + 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, + 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, + 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, + 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, + 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, + 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, + 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, + 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, + 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, + 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, + 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, + 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, + 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 
0xc8f484d1dbc38427, 0xae9c510c463574c0, + 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, + 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, + 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, + 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, + 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, + 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, + 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, + 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, + 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, + 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, + 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, + 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, + 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, + 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, + 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, + 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, + 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, + 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, + 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, + 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, + 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, + 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, + 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, + 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 
0x9b2f7bb03d836a79, + 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, + 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, + 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, + 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, + 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, + 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, + 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, + 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, + 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, + 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, + 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, + 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, + 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, + 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, + 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, + 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, + 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, + 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, + 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, + 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, + 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, + 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, + 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, + 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, + {// seed 
= 3 + 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, + 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, + 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, + 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, + 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, + 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, + 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, + 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, + 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, + 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, + 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, + 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, + 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, + 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, + 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, + 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, + 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, + 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, + 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, + 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, + 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, + 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, + 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, + 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, + 0x0ce2db5e8f9553d6, 
0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, + 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, + 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, + 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, + 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, + 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, + 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, + 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, + 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, + 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, + 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, + 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, + 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, + 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, + 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, + 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, + 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, + 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, + 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, + 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, + 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, + 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, + 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, + 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, + 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 
0x5e6d66141e8d632a, 0x7638285124d5d602, + 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, + 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, + 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, + 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, + 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, + 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, + 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, + 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, + 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, + 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, + 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, + 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, + 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, + 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, + 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, + {// seed = 4 + 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, + 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, + 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, + 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, + 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, + 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, + 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, + 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, + 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 
0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, + 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, + 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, + 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, + 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, + 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, + 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, + 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, + 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, + 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, + 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, + 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, + 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, + 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, + 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, + 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, + 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, + 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, + 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, + 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, + 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, + 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, + 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, + 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, + 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 
0xb7862594da5336fc, + 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, + 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, + 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, + 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, + 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, + 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, + 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, + 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, + 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, + 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, + 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, + 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, + 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, + 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, + 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, + 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, + 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, + 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, + 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, + 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, + 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, + 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, + 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, + 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, + 
0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, + 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, + 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, + 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, + 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, + 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, + 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, + {// seed = 5 + 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, + 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, + 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, + 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, + 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, + 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, + 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, + 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, + 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, + 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, + 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, + 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, + 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, + 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, + 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, + 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, + 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, + 
0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, + 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, + 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, + 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, + 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, + 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, + 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, + 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, + 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, + 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, + 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, + 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, + 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, + 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, + 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, + 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, + 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, + 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, + 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, + 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, + 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, + 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, + 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, + 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, + 0x24ca9c8092105ad5, 
0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, + 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, + 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, + 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, + 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, + 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, + 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, + 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, + 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, + 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, + 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, + 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, + 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, + 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, + 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, + 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, + 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, + 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, + 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, + 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, + 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, + 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, + 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, + {// seed = 6 + 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, + 0x983fde47b3c3847e, 
0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, + 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, + 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, + 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, + 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, + 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, + 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, + 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, + 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, + 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, + 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, + 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, + 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, + 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, + 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, + 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, + 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, + 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, + 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, + 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, + 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, + 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, + 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, + 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, + 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 
0x05602be7df625702, 0x11f1ac73790f7a9f, + 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, + 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, + 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, + 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, + 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, + 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, + 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, + 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, + 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, + 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, + 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, + 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, + 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, + 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, + 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, + 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, + 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, + 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, + 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, + 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, + 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, + 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, + 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, + 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 
0x7094579d1000ec84, + 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, + 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, + 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, + 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, + 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, + 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, + 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, + 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, + 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, + 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, + 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, + 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, + 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, + 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, + {// seed = 7 + 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, + 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, + 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, + 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, + 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, + 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, + 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, + 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, + 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, + 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 
0x8ed27abfa8cfcb05, + 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, + 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, + 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, + 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, + 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, + 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, + 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, + 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, + 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, + 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, + 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, + 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, + 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, + 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, + 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, + 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, + 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, + 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, + 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, + 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, + 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, + 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, + 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, + 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, + 
0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, + 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, + 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, + 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, + 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, + 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, + 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, + 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, + 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, + 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, + 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, + 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, + 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, + 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, + 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, + 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, + 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, + 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, + 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, + 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, + 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, + 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, + 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, + 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, + 0x9c4386cf0a07f3b2, 
0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, + 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, + 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, + 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, + 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, + 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}}; + +} // namespace parquet::internal diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 280704c4247..2661e260d0c 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,7 +754,8 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_size_range(), + content_defined_chunker_(level_info_, properties->cdc_size_range().first, + properties->cdc_size_range().second, properties->cdc_norm_factor()) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); From 4cb991fb07f260cb78b0f3c07368250ccfc1d594 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Mar 2025 19:54:03 +0100 Subject: [PATCH 033/102] fix generated header name --- cpp/src/parquet/chunker_internal_codegen.py | 2 +- cpp/src/parquet/column_chunker_generated.h | 545 -------------------- 2 files changed, 1 insertion(+), 546 deletions(-) delete mode 100644 cpp/src/parquet/column_chunker_generated.h diff --git a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index 063eda4b92f..29cd856f3c4 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -75,7 +75,7 @@ def generate_hashtable(seed: int, length=256): return out.getvalue() -def generate_header(ntables=8, relative_path="column_chunker_generated.h"): +def generate_header(ntables=8, 
relative_path="chunker_internal_generated.h"): path = pathlib.Path(__file__).parent / relative_path tables = [generate_hashtable(seed) for seed in range(ntables)] diff --git a/cpp/src/parquet/column_chunker_generated.h b/cpp/src/parquet/column_chunker_generated.h deleted file mode 100644 index 13a47984b74..00000000000 --- a/cpp/src/parquet/column_chunker_generated.h +++ /dev/null @@ -1,545 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once -#include - -namespace parquet::internal { - -constexpr uint64_t kGearhashTable[8][256] = { - {// seed = 0 - 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, - 0x3f692ed5e9934aba, 0xaab3755952250eb8, 0xdefb168dc2888fa5, 0x501b36f7c77a7d47, - 0xd2fff45d1989642d, 0x80217c1c600e30a6, 0xb9469ee2e43df7ac, 0x3654b76a61999706, - 0x6ea73dfe5de0c6b6, 0xdfd662e1937a589d, 0x0dbe0cc74b188a68, 0xde45f4e6d73ffc6f, - 0xcdf7a7759e70d87e, 0x5d6a951b8d38c310, 0xdc9423c3813fcf2c, 0x25dc2976e167ffce, - 0xc2555baa1d031c84, 0x115bc3f2230a3ab6, 0xd4b10260f350bede, 0xdfd3501ab447d723, - 0x022e79217edaf167, 0x1635e2255c5a7526, 0xa0a750350cc77102, 0xc027133e05d39f56, - 0xd949459779cf0387, 0xb92f1464f5c688c2, 0xd9ac5f3e8b42f2f3, 0xdf02bb6f5ecaac21, - 0x8156f988fac7bfa4, 0xe4580f97bede2ec8, 0x44fe7d17a76fca32, 0x885f59bd54c2014c, - 0x435e63ec655ffae9, 0x5ebc51930967b1f1, 0x5428c2084ac29e47, 0x9465938fec30e36b, - 0xc7cb3de4977772cd, 0x15692d7c201e8c3a, 0x505ee65cdc4b17f4, 0x7d9839a0a7aead6b, - 0xeef5f5b6a0105291, 0x76c2fb232ce7f5bf, 0x5c13893c1c3ff3a9, 0x65b6b547d4442f98, - 0xb8ad7487c8c96fce, 0x906bcf51c99974f8, 0x2f56e48bb943a48c, 0xbc9ab109f82d3a44, - 0xcd5160cdc8c7e735, 0xbe9acb9df3427732, 0x386b91d477d7fade, 0x36be463621dd5af2, - 0xcbe6a2faffd627a8, 0x9c8fd528463a2f5a, 0xb9b88c6bb802b184, 0xb414b4e665c597c7, - 0xbedb142568209556, 0x5360d81c25429dce, 0x63a69a960a952f37, 0xc900d63899e1b503, - 0x1abc63a8b37c7728, 0xa8b3a8b6409080eb, 0x495e391f662959f6, 0xdf1e136f3e12229b, - 0x33d5fc526b0dd38d, 0x321221ae2abfac63, 0x7fde18351fda7395, 0xed79fe5c3a6aa4c3, - 0x2dd6965a4867d8d4, 0x54813ca20fe8799b, 0x5d59ea6456465c39, 0x0de0c294d1936b81, - 0x4aaf0755002c588c, 0x3530a1857ad04c6d, 0xb8a64f4ce184442b, 0xe0def10bceedfa17, - 0x46e38d0a443757ec, 0x9795a1c645ee16d7, 0x7e531def245eac8a, 0x683b25c43a0716cf, - 0x884583d372da219d, 0x5b06b62c910416e5, 0x54b6902fbebd3dbe, 0x931198d40a761a75, - 0xead7d8e830013590, 0x80b4d5dc99bfaced, 0xf98272c8108a1ad2, 
0x1adce054289a0ec6, - 0x7d53a1143c56b465, 0x497fbe4f00c92b52, 0x525e4cc2e81ebd69, 0xc94478e0d5508ff6, - 0xb8a5da83c196d07c, 0x7667a921b65b0603, 0xf236fabbdefe6cd1, 0x53da978d19a92b98, - 0xc604f6e97087124d, 0x2cbd27221924b094, 0x65cd1102c985b1d2, 0x08c0755dc1a97eb4, - 0x5e0419e921c0fef1, 0x282d2c1196f84a29, 0xe21117fcfc5793f7, 0xcf4e985dc38e6c2e, - 0xd521f4f264d55616, 0xde69b04c485f2a10, 0x59410e245305178a, 0xceab1d477c943601, - 0xa9805732d71ee5e9, 0x054cd443896974f6, 0xf2b517717a423a3e, 0x09517937fa9fac95, - 0x4938233e9ca871e3, 0x9132cbaf56f83ec0, 0x4703421ed1dd027d, 0xfd9933f4e6f1ec4e, - 0xf237c7fded2274a8, 0xdf4616efe68cd7b4, 0x5e46de0f39f0a380, 0x3d41e0c6d8e095b0, - 0xc5272f8a5bb2df09, 0x68aa78e8301fb964, 0xbf5b5b52c8e32ae0, 0xbf28ed3df74bdcf7, - 0xd6198f64c833815a, 0x8cd99d2974267544, 0xd90560ea4465ff2c, 0x571d65ad7ad59261, - 0x309453518baa367a, 0xa60538377bc79fb2, 0xace515da1ab4183c, 0xf56d3c8d891d1c5b, - 0x5b0d8370b59def49, 0x775866ce7c83c762, 0x3d76085695c8e18a, 0xba064d1a9af1b114, - 0xc84ef7cd7b98b521, 0x90b9231681c2bc37, 0x37e2b13e6f585b6b, 0x1d0a34e55e0f369f, - 0x86bb8019cf41447c, 0x4b95c6ef55b3f71f, 0x3b6ed1660732b310, 0x617eee603d137f21, - 0xf4f6278b464f3bbc, 0xdfb763b720da205a, 0x353478899b871cb7, 0xe45fbbff574cc41e, - 0x1a94b60847907d72, 0xb10eef051eff67a5, 0xf0e012ec6a284d40, 0xcc1cd1a11b926d7c, - 0xcf9d9c5453e19cad, 0x270febcc0fc0e86b, 0xd6567568778b781e, 0x7323b98965eeb46b, - 0xccecd374567086ff, 0xef7b44bfc497a704, 0xebc479c051a9f0a5, 0xc9b7410e3e00a235, - 0x1d084f7ecdf83dab, 0xc8a9a97e33ba8ba3, 0x8c75318f5b2350d6, 0xaa3cd5d0c684bdda, - 0xa81125fe0901bedf, 0xf7bcd76020edfc93, 0x834ee4c12e75874f, 0xb2bb8a7beb44fa14, - 0x32cd26f50a4f4e4d, 0x0fc5817ca55d959a, 0xd6e4ae2e3ae10718, 0x074abdcceb8d6e38, - 0xc0cc5f4f9b3a9c43, 0x1115d364363595b2, 0x69861db2eb19f2e8, 0x59b8d804cf92bc67, - 0x9bac9785e5e4b863, 0x7fa0e17a41869561, 0x10d3c9633f0c709c, 0x534a03deee6bc44a, - 0x73b1f7201257f581, 0x46fd6a11e2e0706b, 0x494abb554946e67a, 0xb5d6da317864dc8e, - 
0x402ded9238f39687, 0xd8fa37d2cbd6d290, 0xcc818293fcb06791, 0x6482ab344806cd4d, - 0x0956e6ee9d8eb60b, 0x01fee622d8465ac8, 0xae7ece370cbd9c35, 0x7ff09e937a177279, - 0xa2c29ee7a33ca5f1, 0x990e8dbee083923b, 0x4a819b72f610863a, 0xddecfad79d3f08be, - 0x627372480fac20a7, 0x802154d6eca2db4c, 0x8fcf02e42f805e55, 0x040a911ff8cea977, - 0xbb544485bc64d0d4, 0xaddde1aeb406d0fb, 0xf6b35fae23dce66f, 0xc07a9fb3645d2f9b, - 0xccd113907e9c0fed, 0xd17af369984fd213, 0x9223823c59a083e7, 0xe19d475606b81013, - 0xe181ac116a90e57a, 0x71f7b6258c6def4c, 0x2246f34b45964f7c, 0xd74aedaea2d31751, - 0xb1add86e5dd305d1, 0xeb9ba881f16d6471, 0xef7600e036f5c6ff, 0x1d50bc9735b8fb85, - 0xe63942bd1f3e2969, 0x9241ba9f8b3f4e72, 0xee8bb2bca07d35b6, 0x55cd55dab522654e, - 0x94d0cfa7c1a6845d, 0x02f9845d559884c3, 0x8ce70ea21063b560, 0xd70998028ef08b74, - 0xdfdb5bbee310876b, 0x4e21b2e348256d16, 0xde007a981c13debc, 0xe51950cbbddabfdd, - 0xd223301dbe9957c1, 0x084b8634cc2cce4b, 0x90e551378aa9d70c, 0x833b533ac633e448, - 0x7891e232882da57f, 0xa1bf26f0163ce2b3, 0xf33a0171eb9c68d5, 0x2e7de18ca69b3fa2, - 0x666fd6f175619199, 0x1239d37edb5feb9f, 0xfa9fc9382e61ff5c, 0x3ca4ad427e3c126f, - 0x37c6dd4c2c31ae6e, 0x1f1bacb619d427b2, 0x7dd09f5d10759afe, 0xc8d941432327d733, - 0x2b389ba25e1d43a7, 0xa4e3030c3740ff21, 0xcc56dae13fd37463, 0x2481457c175b560f, - 0x9deb35bde77c5c41, 0x847aa6ea5549a0c3, 0xcde01bb48b6e7f02, 0x15a28844e64cb211}, - {// seed = 1 - 0xecfcba92fe5691a3, 0x71377799fea34699, 0xb284c9096fa614e5, 0x54534170f40de6c8, - 0xbbd804d45884fba3, 0x44929a896388c8a1, 0x79b712508e0fa3b1, 0xeb53ab280af31054, - 0x351ea23a6319da7a, 0x2fbe55d9819d85a2, 0x34f4b6568dcd28b1, 0x8c94ea5e5d82967a, - 0x09068d333a46d3c5, 0x762ad4f64cb73381, 0xd5c6db5ef0e22640, 0x36d8ab5a36175680, - 0xd41fe333cdc3525a, 0xa1f51dbdf20ce781, 0x1410a95e786c8be6, 0x96b7499a670c2b41, - 0x3912e1037835d893, 0x272c5bd83e1e9115, 0x2ea7f91cad82a0d6, 0xcd10e85662ce9931, - 0xedad49be8d5e8b74, 0x7ccd8fe0f37d12bc, 0xfac0482005eed593, 0x4513991681f6c8b0, - 
0x2804d612eb0ad37d, 0x7cca9e8412b81d34, 0x85ffd6707192b7b8, 0xea0560aeea954411, - 0x0122d28226102bba, 0xf51c47cdbd22fdd1, 0x3707d851183ff17c, 0xaef5a1465f3e902d, - 0xbcb38c2d8736a04f, 0x4025317e864bef15, 0x8d3f66d86e1ea58f, 0xc16759a3d97ed79a, - 0x1c62abdc0659f2f5, 0x23b3eb4e699bd28f, 0x5083c4fceed3ccaf, 0xa65bf34562cc989c, - 0xaa5865932fd79064, 0xf24d08d268c24593, 0x7fbd00a215196999, 0x7812cd366d752964, - 0x62e8dcb27ef3d945, 0xf08b7984e1b946dc, 0x547d23ad9a5c1dcf, 0x496b1fb249b27fb7, - 0xcd692e1db5f3b3ba, 0x41931e39f1e1bc61, 0x286c6a7d7edae82b, 0x17ef6638b6c4ca6e, - 0x609beb5a2576a934, 0xcc5e16fe4a69b83c, 0xbbd14d08b078fc24, 0x2a617680f481cb94, - 0x81dbbd5f86e6d039, 0xeb8205e1fc8ecc3c, 0xe5e3bb576faa8042, 0x5d6f1eb9d9df01b5, - 0x9a47b8739c10fb44, 0x398a7caad7ea7696, 0x9c0fc1d7c46adde6, 0x67cd6de0a51978a6, - 0x68ccc4b77a21cca4, 0x1e067066b82f415c, 0xf7ddade6535e1819, 0xf2185c884291751b, - 0xc322b7381fcbe34f, 0x242f593e88290b9b, 0x8e11ccc0ea5e84a3, 0x40e3a2e3346db8a2, - 0xf18bfc3ad2931a2c, 0x2468397394b00144, 0xeae199cce14e6817, 0x05b462686c75a1ae, - 0xda096cb859c51673, 0xd87aeb967a906bef, 0xaabc74493cb02fe6, 0x74d48fc2e7da143e, - 0x6ec1c8fed3f2c1fd, 0xe01e0704b463f18e, 0xc3d88a4d3a8056e4, 0xd01ae0ffab6c8f3f, - 0x881ba052620ae7c7, 0xcea033aef0a823a5, 0x8d2cad91d83df1e3, 0x18746d205e66dbe9, - 0x3061f8e58d046650, 0xd819c59f0ce2cf8b, 0x144e89e93635e870, 0x3415e88279b21651, - 0xd6f7ab944b86c3fa, 0x45f1dd15d0f67bdc, 0xbf0d97c7f4fa24f4, 0x34a7de520a57fcd2, - 0x4ba86fda03e9e2bc, 0xa7995265a025b552, 0x698f6819d5f51cf7, 0xd07dbe9d8a156981, - 0x2683945373857fc1, 0x116f8a84f96167de, 0x8bc832bd85595ebf, 0xb206519d74fdfafa, - 0xde9519b2e9b5cc5f, 0x16fdd6f2da1d8163, 0x7ba32bd48ef56f11, 0x6f4e4d7ee8b29717, - 0xd31576dde7468aad, 0x023bb08848676045, 0xf6dcc083178160b7, 0x42035f426250e683, - 0x343732993cfed89f, 0x0640a870a22d3d58, 0x65cff80b53b4ae6a, 0x27996fa17ab05215, - 0xfd5db01401b21a04, 0x894508784bc1673c, 0x5bfcf43a2380e27d, 0x4cd6dcc2715583b7, - 0xa43b3763e7d4c902, 
0x6da83e12ef0c1257, 0xfe80a602b0335aff, 0x293a7d8f4ff344de, - 0xb4ae7c2b8956bf5a, 0x6b45432d38254b4d, 0xd086acbdf15d9455, 0xa4d19e43f41ea87b, - 0xf01f13ba4bb87fbf, 0xca582cf301a299ff, 0x0ddad3d45298fa7d, 0x0646a130459c3999, - 0xc08e3af3747e2cee, 0xfc7db8aa9ed67295, 0x783b329e7bd79d5f, 0x732dbc607957af7b, - 0x8e446ac19fb26555, 0xff1dfa4d61dc89a5, 0xb6fbc46bd8d011d8, 0x185147ec5779f0d7, - 0x6eb2cf6149a5380f, 0xb0e773df803a1eae, 0xc07706c5519bfce5, 0xc35abcf54fa95f14, - 0x40a01d99a38608ea, 0x776dcd6f603c277f, 0x6ae12389b1d6d0bb, 0x8bd981448df92bb9, - 0x426a6a7ca21a2c16, 0x87efd5b71c1bad26, 0x71fb7fc4cd41de48, 0xdd9033c45619d463, - 0x40eaab322654cef7, 0xe077fffed6f3e3a2, 0x375a4dbef9384447, 0x2066b009d2c4a100, - 0xeca4a5794a068447, 0x2128f64bddf341a1, 0x738b4bb1be90bd61, 0x433772cf3813d52e, - 0x9540c88add8e4474, 0x0b6d5decd21d3519, 0x654ead966745642d, 0xe1bfb03c3b4bdb4c, - 0x0b977a9937515b1f, 0x0a4587509ef63870, 0xe89f0de1d9cfd44a, 0x23a91390272e7f68, - 0xd92defbc9096b8d8, 0x004db87174612539, 0xc88ecaabdd1a71f1, 0x050de38393073346, - 0x8af1426d7964e038, 0xf352c4fef8ad5c87, 0x6f26bc7408e26548, 0x0d41543fd9bf3084, - 0xfc4e07553a840fc6, 0x5ef117de86a555a9, 0x1f11c42dffb5ae1b, 0x4147648f07490fa5, - 0x09b35fd7671b21aa, 0x1453b14f7ccca481, 0x944f6fcce4c9b2ba, 0x5b08dd2e3583dc06, - 0xe0220df78dc9c22d, 0x1c200b9506cbf666, 0x8a0b7465eadb523b, 0xfbcb43a91a1e2d80, - 0xe697f44be3c36a58, 0x2f8a8e48fb7e350d, 0x7baba71b8920d55f, 0x10edc0216105bc96, - 0x52db07c79d7a7a63, 0x1916e8cef9452ac3, 0x5cbbbf21f867b6cc, 0xadd583365a690a4b, - 0x4e4ca2c8bffc2fdb, 0xf5fe3416d2eebcfe, 0x839af8b85e452476, 0x8496c0c54ad44e16, - 0x6c46f1ecad4482bf, 0xb794cad76ae18715, 0x67b762eec7c62985, 0x52dc9e68df5b3a53, - 0x0cc7e444b422a5f9, 0xadbfe90841c112b0, 0xfe37b136f0ca5c34, 0xcfe9e47948a8d73e, - 0xee90572b86a30d91, 0x549e72d8262830aa, 0x3361564b469f32c6, 0x1e6eba9e0d2648e2, - 0x5f8e2b2ac5fcb4eb, 0xe4224fa5f71f7cc6, 0x7357a9230c76757b, 0xcad70f74aaf6b702, - 0xeef28ced23894cc2, 0x753fdd3352aefd68, 
0x1fed6ba90bbeb9d2, 0x05316f4ab4034b4b, - 0x3396df022b9f63d6, 0x82d7125a7cfd0935, 0x3519a71caf1f87f0, 0xd1dfb7a5cc3974be, - 0xbfae40ecbdbbcc2a, 0x152c11778e08dd54, 0x4a96566a6c848554, 0x3a84d621c340cdd7, - 0xfd47aa1887e2fb03, 0xa63cae94b2f1d099, 0xed61783f3e5b75e0, 0xefd44864106019be, - 0x145ff78b80b081aa, 0x34670e5fcea9230e, 0x876ef976328db371, 0x4221f3a5269942a6, - 0x95315cbd85c648f4, 0x3ca344dc7c3b1600, 0x38421ea39ff28780, 0x31dbeee967c0435c, - 0x27437c3e268402e7, 0xdd0cf8343312a654, 0x965ab9dad1d8aa29, 0xf871706dd3e23509, - 0xce23d06c7a25e699, 0x1b37d59382b27589, 0x3407f004723d6324, 0x56efb69cdb5deaa1, - 0xf46cdd2b9fd604e0, 0xcad3ca79fdac69bd, 0x7252802a574e63cb, 0xc281fb8acc6ec1d3}, - {// seed = 2 - 0xdd16cb672ba6979c, 0x3954eaa9ec41ae41, 0x52cb802771d2966d, 0xf57ed8eb0d0294f2, - 0x768be23c71da2219, 0x6131e22d95a84ad3, 0xd849e4e49bb15842, 0x18e8e5c4978cf00d, - 0x3af5e5867ce1f9bd, 0x06c75a9fffe83d63, 0xe8de75a00b58a065, 0x0a773251bc0d755a, - 0x629dc21e54548329, 0x2a168f5e5a883e70, 0x33547375f0996c86, 0xdfcb4c7680451322, - 0x55c1ecaaaa57e397, 0x4546c346c24f5a31, 0x6f8f0401dfabc86c, 0x7760d2d36ee340b4, - 0xf6448e48bdeb229d, 0xba70e1633b4dba65, 0x069cda561e273054, 0xa010b6a84aebf340, - 0x5c23b8229eee34b6, 0xea63c926d90153af, 0x7d7de27b3e43ec1b, 0xea119541eddc3491, - 0xf1259daeddfc724c, 0x2873ca9a67730647, 0xa1e7710dade32607, 0x758de030b61d43fd, - 0xd2c9bcbfa475edb4, 0x18ade47bb8a0aa29, 0xf7a74af0ff1aea88, 0x6f8873274a987162, - 0x6963e8d876f4d282, 0xd435d4fe448c6c5b, 0x93ec80ba404cafff, 0xcf90d24c509e41e7, - 0x5f0fc8a62923e36e, 0x9224878fe458f3a4, 0xd9a039edf1945bcd, 0x0877d1892c288441, - 0x75205491f4b4740b, 0x30f9d2d523a9085b, 0x4b7f4029fa097c99, 0x170bb013745709d4, - 0x7087af537f11ef2e, 0x28c62b88e08fc464, 0x84bbcb3e0bb56271, 0x485a4b099165c681, - 0x357c63357caa9292, 0x819eb7d1aee2d27e, 0xdaa759eb9c0f8c9d, 0x42cdc36729cc3db5, - 0x9489aa852eddbb06, 0x8161e4f85a84e6d4, 0xa964863fdad3eb29, 0xcc095ddbce1a6702, - 0x3ecfadbb8dc2ce58, 0x971316509b95a231, 
0xc8f484d1dbc38427, 0xae9c510c463574c0, - 0xdf2b31179600c21a, 0x440de87bada4dfa3, 0xbd8d30f3f6fb7522, 0x84e6d7f678a0e2d0, - 0x0ec4d74323e15975, 0xf6947610dad6d9ab, 0x73a55a95d73fe3a5, 0x3e5f623024d37eda, - 0x8d99a728d95d9344, 0x8b82a7956c4acdc4, 0x7faeaea4385b27f6, 0x540625ff4aa2ff21, - 0x4aa43b3ebd92ce2b, 0x899646a6df2da807, 0x49225115780942d7, 0xe16606636af89525, - 0xb980bcf893888e33, 0xf9ed57695291b0d8, 0x5c6dd14464619afa, 0x50606d69b733d4f3, - 0x7fb1af465b990f97, 0x3fab2634c8bbd936, 0x556da6168838b902, 0x0f15975902a30e1f, - 0xb29d782ae9e1991f, 0xae00e26ff8f7e739, 0xd3da86458bb292d5, 0x4528ee0afb27e4ce, - 0x49882d5ba49fabad, 0x7e873b6a7cf875ee, 0x777edd535113c912, 0x94ed05e7ff149594, - 0x0b8f95fc4211df43, 0x9135c2b42426fef2, 0x411e6c2b47307073, 0x503207d1af0c8cf8, - 0xd76f8619059f9a79, 0x64d24617855dee45, 0xf7bc7a877923196a, 0xd6cc42ed6a65be79, - 0xe3912ff09d4fc574, 0x4192d03b2bc2460a, 0xa0dcc37dad98af85, 0xfc59049b2a5818a4, - 0x2128bae90a5b975f, 0xbe7067ca05ea3294, 0x5bab7e7753064c4f, 0x42cbf0949ef88443, - 0x564df4bbd017492c, 0xf2c2eb500cf80564, 0x5b92e67eb00e92af, 0x8c4103eef59c0341, - 0x83412122b8284998, 0x888daf2da0636b6d, 0x4d54b10303dd07d6, 0x201190e7c1e7b5ed, - 0x3797510bb53a5771, 0x03f7bc598b570b79, 0xdc1e15d67d94f73e, 0x721e8b499ebe02c1, - 0x71f954f606d13fa0, 0x0c7a2e408c168bf0, 0x07df2ef14f69c89d, 0xe295096f46b4baaf, - 0x7a2037916438737e, 0xd1e861aeaf8676ea, 0xb36ebdce368b8108, 0xb7e53b090ddb5d25, - 0x5a606607b390b1aa, 0x475e52994f4a2471, 0xbcc2038ba55b2078, 0x28b8a6b6c80df694, - 0xb5f0130ec972c9a2, 0x7a87cd2a93276b54, 0x4d0eec7ecf92d625, 0xac1a8ce16269a42e, - 0xa4ca0237ca9637b8, 0xd8dc8ff91202b6ff, 0x75b29846799d7678, 0x761b11a5edd9c757, - 0xf2581db294ef3307, 0xe3173c2b6a48e20f, 0xe46fd7d486d65b3c, 0x1352024303580d1f, - 0x2d665dae485c1d6d, 0x4e0905c825d74d3b, 0x14ff470c331c229e, 0xbdc656b8613d8805, - 0x36de38e396345721, 0xaae682c1aa8ff13b, 0x57eb28d7b85a1052, 0xf3145290231d443a, - 0xd0f68095e23cbe39, 0x67f99b3c2570b33d, 0x54575285f3017a83, 
0x9b2f7bb03d836a79, - 0xa57b209d303367a9, 0x7ccb545dd0939c79, 0x1392b79a37f4716d, 0x6e81bb91a3c79bcd, - 0x2c2cd80307dddf81, 0xb949e119e2a16cbb, 0x69625382c4c7596f, 0xf19c6d97204fb95c, - 0x1b2ea42a24b6b05e, 0x8976f83cd43d20ac, 0x7149dd3de44c9872, 0xc79f1ae2d2623059, - 0xca17a4f143a414e1, 0x66d7a1a21b6f0185, 0xed2c6198fe73f113, 0x16a5f0295cbe06af, - 0x5f27162e38d98013, 0xf54d9f295bdc0f76, 0x9ba7d562073ef77b, 0xa4a24daaa2cfc571, - 0x49884cf486da43cd, 0x74c641c0e2148a24, 0xbff9dcbff504c482, 0xf8fc2d9403c837ab, - 0x6ccc44828af0bb1e, 0xbcf0d69b4c19dfdb, 0x8fe0d962d47abf8f, 0xa65f1d9d5514271d, - 0x26ff393e62ef6a03, 0xc7153500f283e8fc, 0xea5ed99cdd9d15cd, 0xfc16ac2ba8b48bb7, - 0xf49694b70041c67a, 0xbd35dd30f5d15f72, 0xcf10ad7385f83f98, 0x709e52e27339cdc2, - 0xe9505cb3ec893b71, 0x2ffa610e4a229af7, 0x12e1bc774d1f0e52, 0xe301a3bb7eacccc8, - 0x1fdd3b6dcd877ebf, 0x56a7e8bda59c05aa, 0x99acd421035d6ab4, 0xfd21e401cecd2808, - 0x9a89d23df8b8d46f, 0x4e26b1f1eb297b9c, 0x9df24d973e1eae07, 0xe6cdc74da62a6318, - 0xfc360d74df992db0, 0xf4eca0a739514c98, 0x481c515ba9bf5215, 0xce89cce80f5f3022, - 0xf487a10fc80e4777, 0x235b379a87e41832, 0x76f72e028371f194, 0xd044d4a201325a7d, - 0x47d8e855e0ffbdde, 0x268ae196fe7334b0, 0x123f2b26db46faa8, 0x11741175b86eb083, - 0x72ee185a423e6e31, 0x8da113dfe6f6df89, 0x286b72e338bbd548, 0xa922246204973592, - 0x7237b4f939a6b629, 0x31babda9bedf039a, 0xb2e8f18c6aeec258, 0x0f5f6ce6dd65a45e, - 0x8f9071a0f23e57d3, 0x71307115ba598423, 0xcbe70264c0e1768c, 0x1c23729f955681a8, - 0xfbc829099bc2fc24, 0x9619355cbc37d5d6, 0xea694d4e59b59a74, 0xb41cf8d3a7c4f638, - 0xae1e792df721cd0b, 0x7cd855d28aac11f6, 0xca11ba0efec11238, 0x7c433e554ce261d8, - 0xe3140366f042b6ba, 0x8a59d68642b3b18c, 0x094fcdd5d7bccac2, 0x9517d80356362c37, - 0x4a20a9949c6c74e8, 0xc25bcf1699d3b326, 0xa8893f1d1ed2f340, 0x9b58986e0e8a886e, - 0x29d78c647587ce41, 0x3b210181df471767, 0xd45e8e807627849d, 0x1ec56bc3f2b653e3, - 0x974ff23068558b00, 0xdb72bdac5d34262c, 0x23225143bb206b57, 0xd0a34cfe027cbb7e}, - {// seed 
= 3 - 0x39209fb3eb541043, 0xee0cd3754563088f, 0x36c05fc545bf8abe, 0x842cb6381a9d396b, - 0xd5059dcb443ce3bf, 0xe92545a8dfa7097e, 0xb9d47558d8049174, 0xc6389e426f4c2fc0, - 0xd8e0a6e4c0b850d3, 0x7730e54360bd0d0d, 0x6ecb4d4c50d050d5, 0x07a16584d4eb229f, - 0x13305d05f4a92267, 0xb278ddd75db4baec, 0x32381b774138608f, 0x61fe7a7163948057, - 0x460c58a9092efee6, 0x553bf895d9b5ff62, 0x899daf2dabfd0189, 0xf388ab9c1c4b6f70, - 0xd600fe47027ea4cd, 0x16d527ec2b5ef355, 0x5ac1f58ff6908c81, 0xa08d79ff8ee9ffe8, - 0xc1060a80b7a5e117, 0x14b2c23118c60bda, 0x8cc0defbb890df8f, 0xe29540fd94c6d28b, - 0xa604f003f82d5b71, 0xa67583d4eb066d18, 0xd62cbd796322b3fc, 0x070cfe244cdcccf3, - 0x73557c30b3af47e5, 0x2e544e31153a2163, 0x996eef7464d5bead, 0xbc71cb5ab0586cdc, - 0x0bfcb6c1b517ed69, 0x62b4f1fcc82e8ca0, 0x0edbc68f544965c5, 0x40fa39baa24af412, - 0xf39aeb2413dab165, 0x17e6013e7afee738, 0x8109bff1c8d42a9d, 0x3cd99863390989b5, - 0x02021a4cc9c336c8, 0xa06060778cb60aa4, 0xd96591db60bc1e06, 0xd2727175183f4022, - 0xcdc1f1c5bce3e7ce, 0xb393ccc447872a37, 0xdf6efe63257ead3a, 0x20729d0340dbceb6, - 0x9f3d2d26fc0ea0d7, 0xf392e0885189bd79, 0xdf2ee01eb212b8b6, 0x6e103a0c0f97e2c3, - 0x96c604a763bd841b, 0x9fc590c43bba0169, 0xf92dcd5ddc248c40, 0x113a8b54446941dc, - 0x5943eda146b46bb8, 0xbf657901a36a39a7, 0x5a4e0e7ea6568971, 0xb94c635bae9f9117, - 0x2626fb65b3a4ef81, 0xa59bfd5478ce97de, 0x79112ba9cc1a1c63, 0xf41f102f002cf39c, - 0x0a589bcbfb7ff1c8, 0xa1478c53540c4fa1, 0x60d55e72c86dfaca, 0x312e7b6840ea7a39, - 0x8aae72dcccfe1f75, 0xff2f51f55bf0247a, 0x3c2e4b109edb4a90, 0x5c6d73f6525c7637, - 0xe49acb04a199f61c, 0x27860642d966df7f, 0x541ce75fb1e21c30, 0xd9fcd6f90806c7cc, - 0xb87c27bc93a7969b, 0x92f77a1179b8f8dc, 0xb1f29379deb89ed4, 0x7e63ead35808efe7, - 0x13545183d7fa5420, 0x575f593e34cf029d, 0x27f1199fb07344ae, 0xe67f95f7dc741455, - 0x49b478b761ab850b, 0xd7bedf794adfc21e, 0xdc788dcd2dda40ae, 0x14673eb9f4d8ad35, - 0x0cced3c71ecf5eb1, 0xe62d4e6c84471180, 0xdfe1b9e2cb4ada7d, 0x70185a8fce980426, - 0x0ce2db5e8f9553d6, 
0x1fedc57bb37b7264, 0xb9310a2e970b3760, 0x989ff8ab9805e87d, - 0x0b912d7eb712d9ee, 0x1fe272830379e67c, 0x16e6a73aff4738fb, 0xeed196d98ba43866, - 0x7088ca12d356cbe2, 0x23539aa43a71eee0, 0xed52f0311fa0f7ad, 0xa12b16233f302eea, - 0xc477786f0870ecb4, 0xd603674717a93920, 0x4abe0ae17fa62a4c, 0xa18f1ad79e4edc8d, - 0xc49fe6db967c6981, 0xcc154d7e3c1271e9, 0xdd075d640013c0c0, 0xc026cd797d10922a, - 0xead7339703f95572, 0x4342f6f11739eb4b, 0x9862f4657d15c197, 0x4f3cb1d4d392f9ff, - 0xe35bffa018b97d03, 0x600c755031939ad3, 0xb8c6557ffea83abf, 0x14c9e7f2f8a122ea, - 0x0a2eb9285ee95a7c, 0x8823fec19840c46f, 0x2c4c445c736ed1d0, 0x83181dff233449f1, - 0x15ed3fca3107bef5, 0x305e9adb688a4c71, 0x7dbef196f68a3e2e, 0x93e47ece3e249187, - 0x8353c5e890ead93c, 0xea8a7ae66abafdf7, 0xf956dbb6becf7f74, 0x9f37c494fbfdb6e4, - 0x11c6cbaa2485dd32, 0x206f336fcca11320, 0x9befe9a59135d8fe, 0x5f3ef8b8db92c7db, - 0xbb305e556ce0ce9a, 0xf26bdafb1305887f, 0xcbf28abe23f08c61, 0x0bc64173b914e00b, - 0x9168da52e983f54a, 0x6ea41d09c3574a3e, 0x78aa44d4a74459ae, 0x2931422878387bf5, - 0x018f64a3a92c2d9c, 0x9be43f6752e66b34, 0xae378890decd1152, 0x07325329a1cb7623, - 0x3b96f4ee3dd9c525, 0x2d6ebcdbe77d61a3, 0x10e32b0e975f510c, 0xffc007b9da959bf9, - 0x38bf66c6559e5d90, 0xbe22bdf0bf8899fe, 0x87807d7a991632a8, 0x149a0d702816766a, - 0x026f723db057e9ab, 0xeeecb83625ec6798, 0xcec2ed5984208148, 0xd985a78e97f03c84, - 0xf96c279e7927b116, 0x99d5027b3204f6e2, 0x13a84878c3d34c55, 0x5cf5ec96229e9676, - 0x0bc36b07e4f8e289, 0xbed33b80a069914d, 0x2fbfbdd1ff4b9396, 0xab352bb6982da90f, - 0x154d219e4fa3f62b, 0x4d087512bb6b9be7, 0xc582e31775ee400e, 0x7dadb002ae8c4a4e, - 0xaae2957375c1aee2, 0x5f36ca643356625b, 0xf87cf8eb76e07fb7, 0x46f432a755e02cc3, - 0x36087e07aba09642, 0xe5642c1e4ebb9939, 0xb9152d22338eefad, 0xf7ba44278a22cf7f, - 0xd3b8013502acd838, 0x7761511da6482659, 0xb0857621638e8e50, 0x552eddb4a8b1d5f5, - 0xc43d9861e812c3ea, 0xd765c2aada47910c, 0x21c935b68f552b19, 0x6256d5641a2b47dc, - 0xab711d8e6c94bc79, 0xa8d0b91a2a01ab81, 
0x5e6d66141e8d632a, 0x7638285124d5d602, - 0x794876dbca3e471f, 0x951937d8682670ce, 0x0f99cb1f52ed466a, 0x8c7cd205543b804c, - 0x2fd24d74a9c33783, 0xe5dcb7b7762e5af1, 0x45e6749cca4af77c, 0x540ac7ee61f2259f, - 0x89c505c72802ce86, 0xeab83b9d2d8000d1, 0x9f01d5e76748d005, 0xc740aaef3035b6d0, - 0x49afcd31d582d054, 0xcba5dc4c1efb5ddc, 0xc0a4c07434350ca1, 0xfc8dfaddcc65ee80, - 0x157c9780f6e4b2d9, 0x9762a872e1797617, 0xc4afae2cf3c7e1bd, 0x71cde14591b595d4, - 0x8843c3e0e641f3b9, 0xd92ecd91dce28750, 0x1474e7a1742cb19f, 0xec198e22764fa06b, - 0x39394edb47330c7d, 0x00ba1d925242533d, 0xaed8702536c6fb30, 0x6d3618e531c2967a, - 0x77f7cedcd7cc0411, 0xbc1e2ab82be5b752, 0x07b0cf9223676977, 0x596c693b099edd53, - 0xbb7f570f5b9b2811, 0x96bfdad3c4a6840c, 0x668015e79b60c534, 0x3ad38d72123f1366, - 0x6b994d81d2fcbb09, 0x70885f022c5052d8, 0xc891ee79d9306a7b, 0x2c4df05c0ed02497, - 0x19ebc13816898be2, 0xea7c64df11c392a2, 0xb7663e88dd12e1bd, 0x79f768cb8e154c21, - 0x1fb21b12e945933b, 0xe6a9045643f6906e, 0x544c47acd7e15371, 0xb7709b14f727e3d1, - 0x326ee36a46942971, 0x477f1cf7b0e2d847, 0x88b8f6b82b3b0c24, 0x18bc357b80e3cd5c, - 0x3333de70e4d66e0b, 0x4fd4c5e148583cf6, 0xae1b62f3008c0af3, 0xc49f419b6ab29cf5, - 0x2c29fa65afc3fa28, 0x4b19d93734d03009, 0x7dd6c09e589276ad, 0x1cece97f30de48ad}, - {// seed = 4 - 0x58bdf4338602e4fb, 0x71a5620b02c926d5, 0x3811c960129c2d9f, 0x29c2fb11fccac567, - 0x0d6b1ea7780f1352, 0xcc4d3ddfae3f87b3, 0xfdd30257362a586b, 0xabc948fde69f25f1, - 0x51b3523469d30f7b, 0xe0f0322724405ace, 0xd3729266d896da1e, 0xb10c37e5147915bf, - 0x8b577039f9fa32a3, 0xe677c6a9cbfb44b3, 0x7317a756ebb51a03, 0xf8e988ef37359485, - 0x600fc1ef3f469ff3, 0xbf0b8f8520444e01, 0x3711168b08b63d73, 0x34146f2944a6cb36, - 0x717feb263862cdde, 0x7185f8347db00412, 0x900798d82127e693, 0x84089e976a473268, - 0x10f8308c0d293719, 0xf62a618d4e5719b8, 0x8bdbd257a1a9516f, 0xf49f666fd7a75110, - 0xbaf45e2db7864339, 0xe4efa1ea0c627697, 0x3e71d4c82a09fe10, 0x54a2a51cf12127bb, - 0xa0592c9f54ba14cd, 0x27dd627a101c7a42, 
0x3d2ceb44b3d20d72, 0x7ee1f94a68ca8f5d, - 0x7e8cb8651b006c36, 0xbd9fa7ca3a475259, 0x856de173586a7b34, 0xcedb291b594cb1b5, - 0xa3d6e462fd21cddc, 0x74561d10af9118e4, 0x13a3d389fc2d4b36, 0xeea8594a4a054856, - 0xf56d7474d9ba4b13, 0x25ddce2f6490b2fd, 0x920653ff3a8d830b, 0xcd8c0c9cdac740d1, - 0x2c348a738db9c4a0, 0x2967ccbe8ea44c22, 0x47963f69adb049f8, 0xf9d01eb5b4cf7eb6, - 0x7a5c26eb63a86bd2, 0x62ad8b7a71fa0566, 0xb373213179f250ae, 0x589d4e9a88245a4d, - 0x433dafebe2d558a8, 0x521fbef2c8fe4399, 0x62a31f9ff9ccd46b, 0x51602203eba7c1a6, - 0x9afc8c451b06c99f, 0xb529085bdbaffcea, 0xac251825cc75892b, 0x94976a5bce23d58e, - 0xdd17925b6c71b515, 0x568fd07a57bce92e, 0xefac31200d8bd340, 0x716c3e466b540ef9, - 0x3d2c9e380063c69b, 0x14168f9a3662dd83, 0xd298c7504dbc412f, 0x74490a94f016719f, - 0x0e0da431e1ab80c8, 0xe321f63dc6b169ae, 0xf08671544febc95a, 0x39324450cc394b3b, - 0xea6e3d35f1aa3a70, 0x8ef8a886508ce486, 0xdc1a631ef0a17f06, 0xfda2b3fbcd79e87b, - 0xd75bcae936403b10, 0xf88b5bd9f035f875, 0xc43efec2e3792dd4, 0xe9fac21a9d47cd94, - 0xc2876f0c4b7d47c3, 0xaba156cf49f368b4, 0x5ccda2170fa58bf9, 0xadc92c879ed18df7, - 0x110c1b227354e6c8, 0x298ee7a603249200, 0xde92142ede0e8ee7, 0x88e4a4610644ba9e, - 0xbb62d277e7641d3a, 0xb9be1985b7bf8073, 0x29024e5426cdb0d1, 0xf6aefd01f3092ab8, - 0x2a07087b313133aa, 0x6d71f445d6dfc839, 0x1e2412ff12e5526b, 0xed5cdeba6617b9e1, - 0x20b1d0d5e5f8760e, 0x12ff15705c368260, 0x7bf4338b7c387203, 0x34ff25f00cd06185, - 0x1148c706c518cf28, 0x5c04f0623388f025, 0xcb9d649275d87d79, 0x9b5f0c24fabc42ec, - 0x1a7b5e7964e33858, 0x2a81bbd8efdc6793, 0x8d05431ffe42752e, 0x83915cd511002677, - 0x580ed4d791837b31, 0x5982e041d19ff306, 0xcad0d08fa5d864ca, 0x867bee6efe1afa63, - 0x26467b0320f23009, 0xd842414dfda4ec36, 0x047fcdcbc0a76725, 0xbddb340a3768aeca, - 0xef4ce6fa6e99ab45, 0x88c5b66c7762bf9b, 0x5679f1c51ffb225d, 0xdab79048317d77ee, - 0xf14e9b8a8ba03803, 0xe77f07f7731184c1, 0x4c2aab9a108c1ef5, 0xa137795718e6ad97, - 0x8d6c7cc73350b88b, 0x5c34e2ae74131a49, 0xd4828f579570a056, 
0xb7862594da5336fc, - 0x6fd590a4a2bed7a5, 0x138d327de35e0ec1, 0xe8290eb33d585b0b, 0xcee01d52cdf88833, - 0x165c7c76484f160e, 0x7232653da72fc7f6, 0x66600f13445ca481, 0x6bbdf0a01f7b127d, - 0xd7b71d6a1992c73b, 0xcf259d37ae3fda4a, 0xf570c70d05895acf, 0x1e01e6a3e8f60155, - 0x2dacbb83c2bd3671, 0x9c291f5a5bca81af, 0xd976826c68b4ee90, 0x95112eec1f6310a2, - 0x11ebc7f623bc4c9a, 0x18471781b1122b30, 0x48f7c65414b00187, 0x6834b03efa2f5c30, - 0x0875ef5c2c56b164, 0x45248d4f2a60ba71, 0x5a7d466e7f7ba830, 0x2bebe6a5e42c4a1d, - 0xd871d8483db51d10, 0x6ee37decd2fd392f, 0x7d724392010cede3, 0x8e96ef11e1c9bcc8, - 0x804a61d86b89d178, 0xbb1b83ce956055ec, 0xcb44e107410ff64f, 0xc426bb09ee0ba955, - 0x057c08f42c3dd7f1, 0x40ea1ec148602bdf, 0xc24688deeb65d7f1, 0xd8bcc53c768ba4e4, - 0x16e0e3af65c1106c, 0xfc12f7e7d647218b, 0x70d6e1d3ee93cef4, 0x01d2a505c4541ef9, - 0x1ef79e16e764d5c3, 0x0363d14d13870b98, 0xb56ef64345d06b11, 0xe653d557ebb7c346, - 0x8304a8597c2b2706, 0x1536e1322ce7e7bb, 0x525aec08a65af822, 0x91f66d6e98d28e43, - 0xe65af12c0b5c0274, 0xdf6ae56b7d5ea4c2, 0x5cef621cedf3c81c, 0x41e8b1ffd4889944, - 0xb5c0f452c213c3e5, 0x77af86f3e67e499b, 0xe20e76ea5b010704, 0xbdc205ab0c889ec0, - 0xc76d93eb0469cd83, 0x17ac27f65cab0034, 0xd49ec4531fd62133, 0x07a873ea2f1b9984, - 0xbff270dfef0032ee, 0x1764dbe91592f255, 0xe40363126f79e859, 0xa06cad3ab46971f6, - 0x0be596e90dedd875, 0x3387cce5c1658461, 0x44246acf88a9585e, 0xe0ad82b92d5ecb2c, - 0x2177491c9a1600a6, 0x16e7c4aac0f02422, 0x75792eeeec15c4e1, 0x2309cd359d08ee30, - 0x7cd9831dd1b83b0a, 0x374914a7c4ee8cf0, 0x0dd17765c9ac2e54, 0xb7847470ba9a7688, - 0xfba4f4bbe2991173, 0x422b203fc3de040e, 0x63bfcaf2ecf2ab0e, 0x0c5559f3a192946e, - 0xfdf80675c1847695, 0xf5f570accab842c9, 0x65cc5a448767afea, 0x1efeb0a7ee234f2f, - 0x9b05f03d81e7b5d2, 0xe7c31317a8626cf4, 0x620f2a53081d0398, 0x1b6de96cdd9943ae, - 0x8c226a436777d303, 0xa08fbbd50fafb10d, 0x6a64c5ec20104883, 0x9c9c653502c0f671, - 0x678a02b2174f52a0, 0x68e008ba16bbad4b, 0xa317c16d2efb860f, 0xeab2075d17ed714c, - 
0x565eeeddf0c4ea15, 0x8ec8e94d242a6c19, 0x139e8e27d9000fae, 0xc977a7ff1b33d2f5, - 0x1d0accca84420346, 0xc9e82602cd436e03, 0x6a2231da53d2ccd3, 0xb44b12d917826e2a, - 0x4f4567c6a74cf0b9, 0xd8e115a42fc6da8f, 0xb6bbe79d95742a74, 0x5686c647f1707dab, - 0xa70d58eb6c008fc5, 0xaaedc2dbe4418026, 0x6661e2267bdcfd3d, 0x4882a6eda7706f9e, - 0xf6c2d2c912dafdd0, 0x2f2298c142fd61f9, 0x31d75afeb17143a8, 0x1f9b96580a2a982f, - 0xa6cd3e5604a8ad49, 0x0dae2a80aad17419, 0xdb9a9d12868124ac, 0x66b6109f80877fac, - 0x9a81d9c703a94029, 0xbd3b381b1e03c647, 0xe88bc07b70f31083, 0x4e17878356a55822}, - {// seed = 5 - 0xb3c58c2483ad5ead, 0x6570847428cdcf6c, 0x2b38adbf813ac866, 0x8cb9945d37eb9ad3, - 0xf5b409ec3d1aed1c, 0xa35f4bffc9bb5a93, 0x5db89cde3c9e9340, 0xff1225231b2afb2b, - 0x157b0b212b9cc47d, 0xf03faf97a2b2e04d, 0x86fdab8544a20f87, 0xfcb8732744ae5c1c, - 0xd91744c0787986d5, 0x5f8db2a76d65ad05, 0xcff605cbed17a90d, 0xf80284980a3164e7, - 0x59cc24e713fccc7d, 0x268982cada117ce4, 0xcd020e63896e730e, 0xe760dc46e9fe9885, - 0x6aaece8ab49c6b5d, 0x7451194d597aae3e, 0x35d4385900332457, 0xa40fb563a096583d, - 0xa797b612f7f11b76, 0x2fed6eb68e6a2b9b, 0x2f06ee64aeffd943, 0x9dd0e49d9ca45330, - 0x97d48f08bd7f1d8f, 0x1cfa7fe3ebe4d8ee, 0x2a2ba076bd397d42, 0x68c4344f7472f333, - 0xce21ec31987d74b5, 0xb73dabdc91d84088, 0x801aadee592222fe, 0xaf41345398ebc3f5, - 0x8a8f653d7f15ee46, 0xce2d065ff2ba2965, 0x4e05da515da2adb7, 0xa6dbdb8aa25f0fd4, - 0xca9f9666bbd2d5a9, 0x6b917ce50bd46408, 0x1550cc564ba6c84d, 0xb3063ae043506504, - 0x84e5f96bb796653d, 0xe2364798096cf6e3, 0x3b0dfedf6d3a53d0, 0xb7e4c7c77bde8d93, - 0xe99545bac9ab418a, 0xa0e31f96889507bb, 0x883c74f80c346885, 0xf674ae0b039fd341, - 0x8bb6ce2d5e8d1c75, 0x0c48737966a7ed7c, 0x04fcdf897b34c61c, 0xe96ac181bacbd4d6, - 0x5a9c55a6106a9c01, 0x2520f020de4f45d3, 0x935730955e94d208, 0xce5ad4d7f3f67d3b, - 0xa4b6d107fe2d81ca, 0x4f0033f50ae7944e, 0x32c5d28dd8a645a7, 0x57ce018223ef1039, - 0x2cbab15a661ab68e, 0x6de08798c0b5bec2, 0xee197fb2c5c007c6, 0x31b630ac63e7bda2, - 
0xab98785aefe9efe3, 0xa36006158a606bf7, 0x7b20376b9f4af635, 0xa40762fdc3c08680, - 0x943b5faffd0ebee2, 0x7f39f41d0b81f06e, 0x7c4b399b116a90f8, 0x24e1662ac92bc9f3, - 0xcf586fc4e8e6c7db, 0xe46e0d047eeb12d7, 0xe8021076e4ea9958, 0x11fc13492e3ca22a, - 0xd61eae01410397e3, 0x7e8c4a58036a8e9f, 0x068a6de267970745, 0x64faab129bef1a41, - 0xb4a6f720943dad01, 0x631491058d73a9d5, 0xdad4fe95eab3ec02, 0x0a8b141c5c3a44f6, - 0x9fc69d4c2b335b98, 0x94d5f84a07d6e4cd, 0x1b73965de143c608, 0x443932c2dda54bcc, - 0x7397818fb0b04cd2, 0xef4ab03a1202b277, 0xf3d2ee459c0c2b92, 0x182d4daf8b058a87, - 0x90e63035d7b51368, 0xba4cd8b9a95d45fd, 0x12a7392c76731090, 0x890d264ec5d082d2, - 0xeeaf5c363da4994e, 0xd6aad756902123fb, 0xb531ebebdb28f191, 0xe71ce659fc59babd, - 0x37c1b94f63f2dcb5, 0xe4e3abeb311f9b96, 0x4a31b72ccb8695d3, 0x52cae1f0629fdce4, - 0xe5b0475e2ed71369, 0x2724e8c3506414fb, 0xbab0367920672deb, 0x0161a781c305449f, - 0x37b70f40f5bb60be, 0xddd1094c50251a01, 0x3b28283afd17224e, 0x06dec0cfe889fc6b, - 0x47608ea95bb4902d, 0xad883ebc12c00e82, 0x9e8d7ae0f7a8df29, 0xa79443e9f7c013a1, - 0xcfa26f68b7c68b71, 0x33ae6cc19bda1f23, 0xd9741e22b407887f, 0xf2bff78066d46b1c, - 0x794123191c9d32d4, 0x56cb6b903764ec76, 0x98775d0ef91e1a5a, 0xae7b713bc15c1db9, - 0x3b4c1a7870ed7a0d, 0x46666965f305cc34, 0x0ea0c3b2e9c6b3cd, 0x4dc387039a143bff, - 0x5f38bb9229ef9477, 0xea5d39ba72af7850, 0x69a5ed0174ce2b6d, 0x06969a36bfe7594d, - 0x0adee8e4065ccaa3, 0x908a581d57113718, 0x64822d6c5a8190ed, 0x8c5068b56ace4e4c, - 0x88ba3b4fb4e30bef, 0xa6ec0b8bb5896cfe, 0x4e23fcc6b47996fd, 0xe18e75b0dd549c7a, - 0xcd90f17e106cf939, 0x1666fdfb2ef7c52f, 0x4fae325f206dd88c, 0xe7bc1160e25b062d, - 0x3cc999cb246db950, 0xc5930a7326cd5c37, 0xb008a48a211367bd, 0xc5559da145a88fd4, - 0x1e3ad46655fac69c, 0x7834266b4841bfd7, 0xa764450fbffc58cc, 0x54d8cf93a939c667, - 0x93c51f11b21b2d9d, 0x0964112082ed65cc, 0x4c2df21213e7fb03, 0xf0405bc877468615, - 0x17b4fc835d116ab4, 0xa6b112ae5f3cb4ef, 0x23cfc8a7fd38a46e, 0x8e0a360dc2774808, - 0x24ca9c8092105ad5, 
0xafd3f75524f2e0d5, 0x4f39ed7dbaddc24c, 0xe5e362c7679a7875, - 0x00914a916b07b389, 0xdfe1119b7d5ab5da, 0xabd6ed9940e46161, 0x630ed2044171e22c, - 0xdecc244157dd1601, 0x777e6d5b4b4868d5, 0x9b3530bee67017d8, 0xd2faf08b291fdcb9, - 0x006e99455d6523de, 0xd559b5817f6955b5, 0xefcc1063b0088c61, 0xed73145ae0f00ae7, - 0xab2af402cf5b7421, 0x897767f537644926, 0x26c9c0473ca83695, 0x192e34e1881b2962, - 0xf7cf666ec3b3d020, 0x27f9b79c7404afb7, 0xe533e8bed3010767, 0xe5817838e11d05d3, - 0x65659c531bd36517, 0xd427c5e0a23836fd, 0xf3eab7ea58fa3528, 0x07683adae1289f35, - 0x201d6af7e896dd32, 0xd5da938b9a21ad88, 0x843fb73ad67bc316, 0x1782ec7d5feef21b, - 0x943f66f6ec772877, 0x7e9112e7b26da097, 0xeac8161f8663c2c7, 0xe8600db480a9ebf4, - 0x07807fc90f6eaf5f, 0xe0e4c9deb41abf83, 0xbdf533db271f9c15, 0xb398411b0497afe2, - 0xdebb45ef25448940, 0xe7a5decefcd376c4, 0xaf1ef3c728c83735, 0xb8b83a99355cb15a, - 0x6444a0344f1611e4, 0xe8bb7f5cf3c60179, 0x77ab5c5177e75ff7, 0xc38fd6fa849d585d, - 0x390d57d53029060a, 0xa66327eb7b8b593c, 0x6350a14f6fcd5ac9, 0x2c08125bcd7008b4, - 0x2d00c299a6a6bf8e, 0x6b0039c1f68d1445, 0x0035150c5d06f143, 0xa34d01628cc927e1, - 0xdf5b3164d7b2ede1, 0x8167db1d0583d72e, 0x4e13b341cd2ae8bc, 0xa693d9b1f416e306, - 0xc15ed7ca0bc67609, 0xdc344313c1c4f0af, 0x88b6887ccf772bb4, 0x6326d8f93ca0b20e, - 0x6964fad667dc2f11, 0xe9783dd38fc6d515, 0x359ed258fa022718, 0x27ac934d1f7fd60a, - 0xd68130437294dbcc, 0xaf5f869921f8f416, 0x2b8f149b4ab4bf9f, 0xc41caca607e421cb, - 0x7746976904238ef9, 0x604cb5529b1532f0, 0x1c94cd17c4c4e4ab, 0xe833274b734d6bbe, - 0xe9f1d3ef674539ce, 0x64f56ed68d193c6a, 0xe34192343d8ecfc1, 0xcb162f6c3aa71fe8, - 0x99eaf25f4c0f8fa4, 0x92f11e7361cb8d02, 0xb89170cddff37197, 0x4f86e68a51e071e3, - 0x31abf6afd911a75b, 0x6d20cf259c269333, 0x4150b9f88fcb6513, 0x705063989ebf7451, - 0x559231d927c84410, 0x1ca8ec4b098bc687, 0xebed22405c9180e0, 0xaa815b37d052af59}, - {// seed = 6 - 0x946ac62246e04460, 0x9cebee264fcbc1ae, 0x8af54943a415652b, 0x2b327ed3b17b8682, - 0x983fde47b3c3847e, 
0x10a3013f99a2ad33, 0x6e230bb92d2721ef, 0x1cf8b8369e5c5c50, - 0x7f64017f2b7b3738, 0xd393248a62417fa1, 0x9ff01c0b20a372c5, 0xb0e44abce7e7c220, - 0xcebb9f88d48a815f, 0xdb7df6bd09033886, 0x7844fc82b6fa9091, 0x72d095449863b8ec, - 0xc13e678c89da2c7e, 0x6caf4d5ad231d12f, 0x2e0ab7b5fcf35c49, 0xf410720cb932a70f, - 0xd66ea581f16fce06, 0x175c9f002f57dc98, 0xccbcfd0d32988775, 0xfde4c407d3b0a232, - 0x5db2931ae7e97223, 0x6e07e2173085809f, 0x6e1d1ec0f9cad73c, 0xb2fc251a7f802619, - 0xbc1fc17f04f342de, 0x8de8f21ec658e078, 0x72c0f40cbee53fd6, 0x0678244411fc17a1, - 0x1d5837ca166b9bbd, 0xc8cada003c554345, 0x6a2fe2bfb2e58652, 0xfca9d797a6f7988b, - 0x6699e24ac737948b, 0x69623ffcb05789ba, 0x946429c529d95b75, 0x0d14df0b2a13970f, - 0x593d8592c440dfec, 0x2ee176f3d7e74b94, 0xae003f1da3be9e26, 0x0c7b02c4c0f6764a, - 0x3117e2fa1f632462, 0xf0f23265b6f1eaeb, 0x3111255d9b10c137, 0xc82745e509a00397, - 0xbd1d04037005fea7, 0xe104ab0dd22a9036, 0x51b27ce50851ac7a, 0xb2cb9fb21b471b15, - 0x29d298074c5a3e26, 0x6ebdf2058b737418, 0xc4a974041431b96f, 0x1ec5a30ccb6bdaac, - 0xe818beede9bf4425, 0x4b69b1bce67a5555, 0xf5c35f1eb0d62698, 0xf4509bbd8e99867c, - 0xb17206debd52e1bc, 0x35785668c770b3be, 0xe9343987ff5863bc, 0x2ee768499ac73114, - 0x5132bb3426eeaaf4, 0x471bce2c6833c5ff, 0xbb9a2d5428e6f6f9, 0xd5678943c595792d, - 0xab2a65e7f81e479c, 0xa82407bb23990b31, 0xdae321383984923c, 0x01823bb22648e6f1, - 0xda6e8df4214a8b04, 0x0e172bb88e03d94f, 0x552da6c22e362777, 0x7ce67329fb0e90cb, - 0x7b2d7f287ede7ebf, 0xd44f8222500651bd, 0x4acca1ef58fbb8ab, 0x428ecf058df9656b, - 0xd7e1ec6a8987c185, 0x365be6a54b253246, 0x168849be1e271ee8, 0x6a00f3c4151a8db2, - 0x37602727ca94b33d, 0xf6b50f18504fa9ce, 0x1c10817f6bc872de, 0x4bfe1fe42b0f3638, - 0x135fad4b8ef6143b, 0x1b25ad2bafc25f58, 0x41e37f85cf321f92, 0xfc73f75d9d5b9bea, - 0x9eb3694d1e9cb7e1, 0x601d51f08fa83b90, 0x234a2a9b88366f41, 0x63fe903e16f2c3bf, - 0x1cdbd34fa751c0b0, 0x0ce4fc6747c0558c, 0x51ed72afb8bb49aa, 0x20313ba13ca12c96, - 0x271fa38f9ebd54c1, 0x3696a5ac03a8edde, 
0x05602be7df625702, 0x11f1ac73790f7a9f, - 0xa2836c099f0810bd, 0xe5ac2e47caa532fa, 0xd9c000a66d39f681, 0xd93d900e6f3d9d5f, - 0x792c81c65b7900f2, 0x5c5dce790ee20da1, 0x74ff1950edec1aee, 0x71fc85fa1e277d8f, - 0x0e77df17d6546cbc, 0x07debad44816c3b4, 0xbafa721581e92a70, 0x8ab6fbe2ed27bba8, - 0xe83243a20dea304a, 0xaa85a63a84c00a07, 0xde0e79917fc4153a, 0x21bb445e83537896, - 0xeedcac49fc0b433a, 0xffb2926a810ae57a, 0xf724be1f41d28702, 0x79cb95746039bb3b, - 0x5a54fe3742a00900, 0xda4768d64922c04f, 0x420396a84a339dae, 0xa171e26ee5e8724e, - 0x4c8da7c5d289c20a, 0x9ebd79a1a8e94742, 0x39235232b97e9782, 0xb75df0be9bba7d80, - 0x0c1d204dd87d48fc, 0x8f81f3e7177266e8, 0xe4a460b39e78d72b, 0x50b98fa151e65351, - 0xb7cb585c3ee1eddc, 0x11cdad9a76ee1dc4, 0xa38054a78595dc1c, 0x92f09e2ec4978edc, - 0xa8f0061b5efdabaa, 0x04bcc4abc224d230, 0xc58606738e692d46, 0xdd2b27b565952433, - 0x19e6ed1b740beec0, 0xceadd49b2ef9891f, 0x328178c28fe95cad, 0xe5ad4c43afe02848, - 0x03c0cb538cd967c0, 0xec4352526d19a630, 0x4c7e99389d39b031, 0xf65dd05362c2deb6, - 0xd1e70daf6879d28d, 0xbe9f57db6309b265, 0xa4b66f370b872bb7, 0xe26896fbc6ee1fd5, - 0xac705e661bfcf7c5, 0xab4d0d07d7f09940, 0x976417c06aeb6267, 0x8161c684a6bd468c, - 0xf77b6b9976dc4601, 0xc6489b779a39c12c, 0xb2aa58d5681cea1a, 0x043b1b40f8c3e04c, - 0x681fcbfadc845430, 0xab8896c921ba8def, 0x57aaf172606f37b2, 0xc3735048cd5eb8d7, - 0xa7078b96955631bd, 0xdd6b3543aa187f33, 0xc7103ea4a2a697fd, 0x8d7b95f6ff1f7407, - 0xe44f419e84709530, 0xf340caa9132cbb0a, 0x2ba407283143c66c, 0xe1be240ca636c844, - 0x90d32f2877ac08bc, 0x5d26e6294b2c8673, 0x4a6b2f5b27c87a44, 0x961fb9043f76d34f, - 0x0afee02d8d3c55d2, 0x6228e3f48c42e5dc, 0xc338e69ee6593675, 0x853f74b16efb7bdd, - 0xd062f40bdd22e687, 0x647164b9ab4c4190, 0xf94689f67d598369, 0x8e4b29d87a5012d7, - 0xaf02b8b925656fbd, 0x7a722a767179a630, 0xb5c8afe937a75ace, 0xfdb8e8d02d279372, - 0x887ef700cb25fae1, 0xcfe9bd912f72cabe, 0xb1d4dedc24f978de, 0x517522d38319cc2a, - 0x7dd87b2b36aab798, 0x579c4ff3046b5a04, 0xf5c5975c5028b7a7, 
0x7094579d1000ec84, - 0xbc8d5b1ea70a5291, 0x161b2d783be8855c, 0xd26d0b0d6d18279f, 0x0be1945f02a78bd5, - 0xb822a5a9e045415b, 0x2fe9d68b1ccc3562, 0xb2e375960033d14f, 0x26aca04e49b4ff22, - 0x732a81c862112aea, 0x8bd901ed6e4260b8, 0xe839532c561ad5b0, 0x8fb6e4d517a79b12, - 0x0dd37f8c0be9b429, 0xc8ad87ad12f1b1b0, 0xc51f3aa62b90318b, 0x031a7e8b86c1cefc, - 0xa95547af2b70fc76, 0x9cb3615c5a98801e, 0xa387e3c3341d7032, 0xa087ea52a1debaef, - 0x16325ec9a2e6e835, 0x587944a484c585eb, 0xc8879033bde22ecc, 0xa39dbfce709c464a, - 0x7acc010f99208774, 0x98dd2973a096c5ad, 0x26458b51139f198c, 0x2f5d19575e8c4f02, - 0x726643f0d38af352, 0x44d879b6d73e6e94, 0xa68a03885c980abe, 0x06048acd161c40c0, - 0xa4dab8f89d405d28, 0x7120c880cb04be18, 0xa062ace22a1cf0cf, 0x3901a9daf29704f4, - 0xff08f3ed989db30a, 0x6d22b13e874c67e9, 0x80c6f35518d73f4d, 0xc23c2a521aac6f29, - 0x2e708fd83aaa42e0, 0x7fc3780f55f1b0fd, 0xabb3075c98cf87f2, 0xb4df3f40f7c61143, - 0x2a04418098a76d75, 0x0d9eeee9509b2d37, 0x6be8ae51f4b59cdc, 0xe746cc7c00e4a2ab, - 0x785bc6df9cac597c, 0x33cb6620ce8adc48, 0xc1ba30739bffcef7, 0x6d95771f18e503f7, - 0xf7be3ae2e62652ff, 0xc8d82ffd2a73c62b, 0x8725a3ba5b110973, 0x67ed6b9c724757ec}, - {// seed = 7 - 0xc0272d42c19ff3ae, 0x4694228b43ea043b, 0x5709a6ef8a462841, 0xc9210a1e538805c9, - 0x279b171196113ec2, 0x859b769fc2d9e815, 0x0d5d3125a2bf14d3, 0x22bca1cfefa878ba, - 0x481b6bf58037bd83, 0x4933ba8647728d22, 0xf08c7b6b56f6e1b6, 0x374e8af5a15407c7, - 0xa95c4dc3d2487a5c, 0x9b832808ff11e751, 0xf2048507e9da01d5, 0xa9c576189f544a4a, - 0xf6c2a45b2e9d2b41, 0x9b9874c9f10ecc2f, 0x37d9b5f51f8c149e, 0x93aead54c9de9467, - 0x59cf0b4af262da23, 0xe7e9929af18194b2, 0x9df2644e33eb0178, 0xde4122d6f0671938, - 0xf005786c07f4800b, 0xb1fc9d254b5d1039, 0x0bf1088631f6dd7b, 0x665623f0a4b8f0c7, - 0x60f0113a9187db7c, 0xfd7cceda4f0d23a6, 0x26c01e9d89955940, 0x33afa1dfc0f5a6a0, - 0xeb77daf215e9283c, 0xc7575214bf85edb4, 0xeb0d804bf297e616, 0x84bff4ffd564f747, - 0xc4ac33189246f620, 0x43ef61213ecc1005, 0xcbbb0dea6cd96acd, 
0x8ed27abfa8cfcb05, - 0x543b61529cb996b6, 0xa5f987ca41ea5e59, 0x3c50e0ac5254cb7a, 0x4192b0446c06d1e6, - 0x3e86592e21b45388, 0xdb766f06fcc6e51e, 0x0448ee36efe632db, 0x663c9db689253e35, - 0x72e0bd4985331dd4, 0xff501b5bf7d94e74, 0xe911ce758e2113a8, 0xec3a8d03a75a6ba4, - 0xaf6b4b72f56edc83, 0xf284857936c0a391, 0x5ba6feff407d46f4, 0x9d689c26de9d6702, - 0x28c04a9083726b5d, 0x2ccf4a627a029730, 0x7b4719500d4f0c71, 0x76470a9a7da250a8, - 0xcc48409404a1c890, 0xccefbdc7ec9a8055, 0xe0db91bff3cc42d3, 0x0532436426141254, - 0xf2ee9325e6f0ff0b, 0x149c20a5fbb28d9d, 0xe71624cd8d2d14d4, 0x8f01d4dc8cc2dd77, - 0x29cf409b333015b7, 0xba8bebd211884dd1, 0xc3396635e8c8db1d, 0x8ed0f6208d0528b8, - 0x0d90b43fdd0ee334, 0xd73c9a3333a044c7, 0xa2595cd208dbdc38, 0xae93cb264f940c09, - 0x8e0538d8afb07a97, 0x19115ec881385ba2, 0xa886f9e6a8039c6a, 0xcd5d62147ce3ecac, - 0xaecdf9e0bb4969f7, 0x2ddd631c53dcad10, 0x73ad1c97b3412054, 0xb08915fa2722efc6, - 0x97966047e5067eb0, 0x337f1675ed91445c, 0xb3a833d150b96a0d, 0x5940a98fe35e5e2e, - 0xfd03cc354ed0d8ff, 0x4e65b98291a8644a, 0x14a259f2852a60b2, 0x7648e3478c1e8e5f, - 0xbc0fbef6d9a919b4, 0xbec4302081346cf1, 0x57d2ce7aa1c7c511, 0x234c209d8f4e1ac3, - 0x87cf80cc933ce443, 0x7c262c616931e94e, 0xc5e33b049cf9eddf, 0x1a80790ed03ae51b, - 0xf2e8b9494f7220cf, 0x124cb59c14fff3ff, 0xa8a06cbfdb86ce18, 0x9068ef1f80b37653, - 0x0c55417b8d90338f, 0xcd579a523f6bcd30, 0xa31bfe2476a8d2a9, 0x1f8d142208094223, - 0x332dc40a5203cfad, 0xf8792fe5b2d33b4c, 0x443bd9668bf9461e, 0xc9019db0ace1409e, - 0x781bea919a113e8b, 0xb0f11d866abfbeec, 0xcfe139a60db0c26a, 0x869ab8721e6aa39e, - 0xdb48a4977717837a, 0x588a5ff151065b18, 0xe4a251ea0028864d, 0x7f0e43ba408a77c3, - 0x65f66dd50a536135, 0x6f49e934d9331c3e, 0xb8d742e0f0fa6b09, 0xe4e9b272deca2348, - 0xaee132ff902f773c, 0x43f658f7c2a0c90a, 0x28cb4dbc76cc53ea, 0x7d92253aa99ac39b, - 0x4fea3d832370baab, 0xb29e36936e51d78e, 0xea10778712321064, 0xff4f21f8ef274be2, - 0x84eff18ddfa0933f, 0xd0ec6a9f86c758a0, 0xaf82e5973c431ae0, 0x352023c00c045425, - 
0xad34d7bc4a2f8961, 0xbdb4a02a24d4dee0, 0x354a4846d97447cf, 0x331a8b944d5bc19f, - 0x5ce04f8e17909035, 0x6497581bad8f4aab, 0x07c503bba647111e, 0x85f412ba78e1f7ff, - 0x7f3b920fd20f4cff, 0x424e1a9a4ce34e2f, 0x3035e2d62e1b9f0a, 0xef63114bff7b729a, - 0xe86a05889ab6bb60, 0xee0830cf095585a1, 0x4a54f7fa47d9c94b, 0x17daeece9fcb556a, - 0xc506d3f391834c6f, 0xb3f24be362e1af64, 0xc435e4e23608efdd, 0xeeba9caaa4cc1768, - 0x5a71f306daddc22d, 0x18e5205f41eba1a0, 0x7b29b4d1f6610925, 0x065cb65a0258d9a9, - 0x3e5ac8faa9fd1f95, 0x3b362362c1ea0470, 0xce0e4f6434db7a2e, 0xf327341098de52f2, - 0xcfca3b9e2a1992c3, 0x7483bf9401233e41, 0xbafbac531c6f9281, 0x4b52dd71b2c106f8, - 0xdf73b66e50b5a1f7, 0x237aec0202a20283, 0x23dd5be23dffdf2b, 0xea9730731ee122ef, - 0x5cb3f846014fbcd3, 0xc3b21c8ffdce9201, 0x06a99a02f91a8760, 0x721a81fa8fd7b7a3, - 0x6aafcdddc53cbcd8, 0xd03b464005a93bcc, 0x8212edc1b1669dcb, 0x71f4c31364c31bc7, - 0xfeeec0eba8772307, 0x1948d00a13d88cf1, 0x19064fd6d943ada8, 0x4ec8d31722697bfd, - 0x596d9a953a516609, 0xc4cb4bff53507da2, 0x1d59f3c5be36e4ca, 0xe5b4fc5bf6044c9b, - 0x1bb74e052232f735, 0x04e8a0db611ddd5d, 0x8d04eaa009b421bf, 0xa7878ae0ac0e6d58, - 0x28c1030217cab2b3, 0x827943767e56a883, 0x28fce5fa02d22809, 0xb30c322fffc8c58e, - 0x1ca5a6a9f8066c5b, 0xb24db5f1462b2513, 0x02f653b89b7e5f6c, 0xe31f8fb5d5f78eee, - 0x266acc514ed93501, 0x936879d1c6fddcc4, 0xcd51be3636af1952, 0x3fdbb6fc332c78c8, - 0x9eb656379fa73094, 0x056146cc92fa0f96, 0xed6c4f1836c027c3, 0x021e0bb5d2113f2a, - 0x8983e42ec1c626b3, 0x73ea9bc6513ad9c9, 0x0c904903b24f4247, 0xacbac1e6243e2525, - 0x0b1069a0c230fb06, 0x77d709fca3fc1ce5, 0x87ad0f65020947e6, 0x555302641c53f4e6, - 0x65ea87871fa9aaee, 0x58aaf4ecc1067bb4, 0x1a66c48cc4c65b3f, 0xca96aca48b2ea969, - 0xa68eb70bad14de2b, 0x5ccdb3d7e00a6f6e, 0xe178fbfec73fe72f, 0x2b63d6a16b83e890, - 0x32fdb7a5330fbae0, 0x2ab5803c8d1bf32c, 0xda838388c1527c94, 0x16a50bdc4de24acb, - 0xe561301f134c074a, 0xd7ae63d2816b4db1, 0x036aabd4df0dd741, 0xc5e0db8783435b9d, - 0x9c4386cf0a07f3b2, 
0x6a72ac1aa56a13a1, 0x299bbdb04bb20a23, 0x138c1018fda16b81, - 0x0e354f0b3bda49df, 0x9f4c295b23127437, 0xd133ceb2bd561341, 0xd8b4bfd5a526ac29, - 0xcdd0a70ddc1c7bbd, 0x81dce595bf572225, 0x1c6f925c05f6efd7, 0x8ae5097553856ea0, - 0x3aabeaeef248f60d, 0xd9005809d19a69e2, 0x2a3a1a314311cc27, 0x89bb2dc76b2b624a, - 0x50a2a95d0412e289, 0x9def8df564e68581, 0xf49010a9b2e2ea5c, 0x8602ae175d9ff3f0, - 0xbf037e245369a618, 0x8038164365f6e2b5, 0xe2e1f6163b4e8d08, 0x8df9314914f0857e}}; - -} // namespace parquet::internal From 0b868ca2c53089d246531259f086604afbede0ea Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Mar 2025 20:06:25 +0100 Subject: [PATCH 034/102] more docstring for CDC arguments --- cpp/src/parquet/chunker_internal.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index b7334d3f6a5..77ab1b7784a 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -99,11 +99,24 @@ class ContentDefinedChunker { /// Create a new ContentDefinedChunker instance /// /// @param level_info Information about definition and repetition levels - /// @param size_range Min/max chunk size as pair, the chunker will - /// attempt to uniformly distribute the chunks between these extremes. + /// @param min_size Minimum chunk size in bytes, the rolling hash will not be updated + /// until this size is reached for each chunk. Note that all data sent + /// through the hash function is counted towards the chunk size, + /// including definition and repetition levels if present. + /// @param max_size Maximum chunk size in bytes, the chunker will create a new chunk + /// whenever the chunk size exceeds this value. The chunker will + /// attempt to uniformly distribute the chunks between min_size and + /// max_size. /// @param norm_factor Normalization factor to center the chunk size around the average /// size more aggressively. 
By increasing the normalization factor, - /// probability of finding a chunk boundary increases. + /// probability of finding a chunk boundary increases improving the + /// deduplication ratio, but also increases the number of small + /// chunks resulting in small parquet data pages. The default value + /// provides a good balance between deduplication ratio and + /// fragmentation. Use norm_factor=1 or norm_factor=2 if a higher + /// deduplication ratio is required at the expense of fragmentation, + /// norm_factor>2 is typically not increasing the deduplication + /// ratio. ContentDefinedChunker(const LevelInfo& level_info, uint64_t min_size, uint64_t max_size, uint8_t norm_factor = 0); From 02143fce66cd9e4c597230d52d026c79ee325b7d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 6 Mar 2025 21:02:56 +0100 Subject: [PATCH 035/102] prefer templated GenerateArray rather than macro --- cpp/src/parquet/chunker_internal_test.cc | 79 ++++++++++++++---------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 99301179ebd..c4d963217f6 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -65,34 +65,38 @@ inline uint64_t hash(uint64_t seed, uint64_t index) { return h; } -#define GENERATE_CASE_BODY(BUILDER_TYPE, VALUE_EXPR) \ - { \ - BUILDER_TYPE builder(type, default_memory_pool()); \ - if (nullable) { \ - for (int64_t i = 0; i < length; ++i) { \ - uint64_t val = hash(seed, i); \ - if (val % 10 == 0) { \ - RETURN_NOT_OK(builder.AppendNull()); \ - } else { \ - RETURN_NOT_OK(builder.Append(VALUE_EXPR)); \ - } \ - } \ - } else { \ - for (int64_t i = 0; i < length; ++i) { \ - uint64_t val = hash(seed, i); \ - RETURN_NOT_OK(builder.Append(VALUE_EXPR)); \ - } \ - } \ - std::shared_ptr array; \ - RETURN_NOT_OK(builder.Finish(&array)); \ - RETURN_NOT_OK(array->ValidateFull()); \ - return array; \ - } - -// Macro to 
generate a case for a given scalar type. -#define GENERATE_CASE(TYPE_ID, BUILDER_TYPE, VALUE_EXPR) \ - case ::arrow::Type::TYPE_ID: { \ - GENERATE_CASE_BODY(BUILDER_TYPE, VALUE_EXPR) \ +template +Result> GenerateArray(const std::shared_ptr& type, + bool nullable, int64_t length, uint64_t seed, + ValueFunc value_func) { + BuilderType builder(type, default_memory_pool()); + + if (nullable) { + for (int64_t i = 0; i < length; ++i) { + uint64_t val = hash(seed, i); + if (val % 10 == 0) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(value_func(val))); + } + } + } else { + for (int64_t i = 0; i < length; ++i) { + uint64_t val = hash(seed, i); + RETURN_NOT_OK(builder.Append(value_func(val))); + } + } + + std::shared_ptr array; + RETURN_NOT_OK(builder.Finish(&array)); + RETURN_NOT_OK(array->ValidateFull()); + return array; +} + +#define GENERATE_CASE(TYPE_ID, BUILDER_TYPE, VALUE_EXPR) \ + case ::arrow::Type::TYPE_ID: { \ + auto value_func = [](uint64_t val) { return VALUE_EXPR; }; \ + return GenerateArray(type, nullable, length, seed, value_func); \ } Result> GenerateArray(const std::shared_ptr& field, @@ -122,7 +126,11 @@ Result> GenerateArray(const std::shared_ptr& field // Limit the value to fit within the specified precision int32_t max_exponent = decimal_type.precision() - decimal_type.scale(); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); - GENERATE_CASE_BODY(::arrow::Decimal128Builder, ::arrow::Decimal128(val % max_value)) + auto value_func = [&](uint64_t val) { + return ::arrow::Decimal128(val % max_value); + }; + return GenerateArray<::arrow::Decimal128Builder>(type, nullable, length, seed, + value_func); } case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(*type); @@ -130,7 +138,11 @@ Result> GenerateArray(const std::shared_ptr& field // int64_t overflow int32_t max_exponent = std::min(9, decimal_type.precision() - decimal_type.scale()); int64_t max_value = static_cast(std::pow(10, 
max_exponent) - 1); - GENERATE_CASE_BODY(::arrow::Decimal256Builder, ::arrow::Decimal256(val % max_value)) + auto value_func = [&](uint64_t val) { + return ::arrow::Decimal256(val % max_value); + }; + return GenerateArray<::arrow::Decimal256Builder>(type, nullable, length, seed, + value_func); } // Temporal types @@ -151,8 +163,11 @@ Result> GenerateArray(const std::shared_ptr& field std::string("bin_") + std::to_string(val)) case ::arrow::Type::FIXED_SIZE_BINARY: { auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); - GENERATE_CASE_BODY(::arrow::FixedSizeBinaryBuilder, - std::string("bin_") + std::to_string(val).substr(0, size - 4)) + auto value_func = [size](uint64_t val) { + return std::string("bin_") + std::to_string(val).substr(0, size - 4); + }; + return GenerateArray<::arrow::FixedSizeBinaryBuilder>(type, nullable, length, seed, + value_func); } case ::arrow::Type::STRUCT: { From 34dbf5bbd6ca2e039f107ab8a4706d438cfc2196 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 13:25:57 +0100 Subject: [PATCH 036/102] don't hash undefined null values; reduce generated code size by dispatching based on type width; use pointers when calculating rolling hash --- cpp/src/parquet/chunker_internal.cc | 172 ++++++++++++++++------------ cpp/src/parquet/chunker_internal.h | 10 +- 2 files changed, 107 insertions(+), 75 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 2ded58c7bdd..a85e7ccfaa3 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -28,16 +28,6 @@ namespace parquet::internal { -// create a fake null array class with a GetView method returning 0 always -class FakeNullArray { - public: - uint8_t GetView(int64_t i) const { return 0; } - - std::shared_ptr<::arrow::DataType> type() const { return ::arrow::null(); } - - int64_t null_count() const { return 0; } -}; - static uint64_t GetMask(uint64_t min_size, uint64_t max_size, 
uint8_t norm_factor) { // we aim for gaussian-like distribution of chunk sizes between min_size and max_size uint64_t avg_size = (min_size + max_size) / 2; @@ -59,8 +49,18 @@ ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, max_size_(max_size), hash_mask_(GetMask(min_size, max_size, norm_factor)) {} +void ContentDefinedChunker::Roll(const bool value) { + if (chunk_size_++ < min_size_) { + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; + } + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value]; + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); +} + template -void ContentDefinedChunker::Roll(const T value) { +void ContentDefinedChunker::Roll(const T* value) { constexpr size_t BYTE_WIDTH = sizeof(T); chunk_size_ += BYTE_WIDTH; if (chunk_size_ < min_size_) { @@ -68,23 +68,22 @@ void ContentDefinedChunker::Roll(const T value) { // chunking process since the gearhash doesn't need to be updated return; } - auto bytes = reinterpret_cast(&value); + auto bytes = reinterpret_cast(value); for (size_t i = 0; i < BYTE_WIDTH; ++i) { rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][bytes[i]]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } -void ContentDefinedChunker::Roll(std::string_view value) { - chunk_size_ += value.size(); +void ContentDefinedChunker::Roll(const uint8_t* value, int64_t num_bytes) { + chunk_size_ += num_bytes; if (chunk_size_ < min_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } - for (char c : value) { - rolling_hash_ = - (rolling_hash_ << 1) + kGearhashTable[nth_run_][static_cast(c)]; + for (int64_t i = 0; i < num_bytes; ++i) { + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; has_matched_ = 
has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } @@ -114,22 +113,20 @@ bool ContentDefinedChunker::NeedNewChunk() { return false; } -template +template const std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const T& leaf_array) { + const RollFunc& RollValue) { std::vector chunks; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; if (!has_rep_levels && !has_def_levels) { // fastest path for non-nested non-null data - int64_t offset = 0; int64_t prev_offset = 0; - while (offset < num_levels) { - Roll(leaf_array.GetView(offset)); - ++offset; + for (int64_t offset = 0; offset < num_levels; ++offset) { + RollValue(offset); if (NeedNewChunk()) { chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; @@ -140,12 +137,15 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev } } else if (!has_rep_levels) { // non-nested data with nulls - int64_t offset = 0; + int16_t def_level; int64_t prev_offset = 0; - while (offset < num_levels) { - Roll(def_levels[offset]); - Roll(leaf_array.GetView(offset)); - ++offset; + for (int64_t offset = 0; offset < num_levels; ++offset) { + def_level = def_levels[offset]; + + Roll(&def_level); + if (def_level == level_info_.def_level) { + RollValue(offset); + } if (NeedNewChunk()) { chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; @@ -161,8 +161,8 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev int16_t def_level; int16_t rep_level; int64_t value_offset = 0; - int64_t record_level_offset = 0; - int64_t record_value_offset = 0; + int64_t prev_level_offset = 0; + int64_t prev_value_offset = 0; for (int64_t level_offset = 0; level_offset < num_levels; ++level_offset) { def_level = def_levels[level_offset]; @@ -171,18 +171,18 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* 
def_lev has_leaf_value = def_level >= level_info_.repeated_ancestor_def_level; is_record_boundary = rep_level == 0; - Roll(def_level); - Roll(rep_level); + Roll(&def_level); + Roll(&rep_level); if (has_leaf_value) { - Roll(leaf_array.GetView(value_offset)); + RollValue(value_offset); } if (is_record_boundary && NeedNewChunk()) { - auto levels_to_write = level_offset - record_level_offset; + auto levels_to_write = level_offset - prev_level_offset; if (levels_to_write > 0) { - chunks.emplace_back(record_level_offset, record_value_offset, levels_to_write); - record_level_offset = level_offset; - record_value_offset = value_offset; + chunks.emplace_back(prev_level_offset, prev_value_offset, levels_to_write); + prev_level_offset = level_offset; + prev_value_offset = value_offset; } } @@ -191,57 +191,87 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev } } - auto levels_to_write = num_levels - record_level_offset; + auto levels_to_write = num_levels - prev_level_offset; if (levels_to_write > 0) { - chunks.emplace_back(record_level_offset, record_value_offset, levels_to_write); + chunks.emplace_back(prev_level_offset, prev_value_offset, levels_to_write); } } return chunks; } -#define PRIMITIVE_CASE(TYPE_ID, ArrowType) \ - case ::arrow::Type::TYPE_ID: \ - return Calculate(def_levels, rep_levels, num_levels, \ - static_cast(values)); +#define FIXED_WIDTH_CASE(CType) \ + { \ + const auto raw_values = values.data()->GetValues(1); \ + return Calculate(def_levels, rep_levels, num_levels, \ + [&](int64_t i) { return Roll(raw_values + i); }); \ + } + +#define BINARY_LIKE_CASE(OffsetCType) \ + { \ + const auto raw_offsets = values.data()->GetValues(1); \ + const auto raw_values = values.data()->GetValues(2); \ + return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \ + const OffsetCType pos = raw_offsets[i]; \ + const OffsetCType length = raw_offsets[i + 1] - pos; \ + Roll(raw_values + pos, length); \ + }); \ + } const std::vector 
ContentDefinedChunker::GetBoundaries( const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { auto type_id = values.type()->id(); switch (type_id) { - PRIMITIVE_CASE(BOOL, Boolean) - PRIMITIVE_CASE(INT8, Int8) - PRIMITIVE_CASE(INT16, Int16) - PRIMITIVE_CASE(INT32, Int32) - PRIMITIVE_CASE(INT64, Int64) - PRIMITIVE_CASE(UINT8, UInt8) - PRIMITIVE_CASE(UINT16, UInt16) - PRIMITIVE_CASE(UINT32, UInt32) - PRIMITIVE_CASE(UINT64, UInt64) - PRIMITIVE_CASE(HALF_FLOAT, HalfFloat) - PRIMITIVE_CASE(FLOAT, Float) - PRIMITIVE_CASE(DOUBLE, Double) - PRIMITIVE_CASE(STRING, String) - PRIMITIVE_CASE(LARGE_STRING, LargeString) - PRIMITIVE_CASE(BINARY, Binary) - PRIMITIVE_CASE(LARGE_BINARY, LargeBinary) - PRIMITIVE_CASE(FIXED_SIZE_BINARY, FixedSizeBinary) - PRIMITIVE_CASE(DATE32, Date32) - PRIMITIVE_CASE(DATE64, Date64) - PRIMITIVE_CASE(TIME32, Time32) - PRIMITIVE_CASE(TIME64, Time64) - PRIMITIVE_CASE(TIMESTAMP, Timestamp) - PRIMITIVE_CASE(DURATION, Duration) - PRIMITIVE_CASE(DECIMAL128, Decimal128) - PRIMITIVE_CASE(DECIMAL256, Decimal256) + case ::arrow::Type::NA: { + return Calculate(def_levels, rep_levels, num_levels, [](int64_t) {}); + } + case ::arrow::Type::BOOL: { + const auto& bool_array = static_cast(values); + return Calculate(def_levels, rep_levels, num_levels, + [&](int64_t i) { return Roll(bool_array.Value(i)); }); + } + case ::arrow::Type::INT8: + case ::arrow::Type::UINT8: + FIXED_WIDTH_CASE(uint8_t) + case ::arrow::Type::INT16: + case ::arrow::Type::UINT16: + case ::arrow::Type::HALF_FLOAT: + FIXED_WIDTH_CASE(uint16_t) + case ::arrow::Type::INT32: + case ::arrow::Type::UINT32: + case ::arrow::Type::FLOAT: + case ::arrow::Type::DATE32: + case ::arrow::Type::TIME32: + FIXED_WIDTH_CASE(uint32_t) + case ::arrow::Type::INT64: + case ::arrow::Type::UINT64: + case ::arrow::Type::DOUBLE: + case ::arrow::Type::DATE64: + case ::arrow::Type::TIME64: + case ::arrow::Type::TIMESTAMP: + case ::arrow::Type::DURATION: + 
FIXED_WIDTH_CASE(uint64_t) + case ::arrow::Type::BINARY: + case ::arrow::Type::STRING: + BINARY_LIKE_CASE(int32_t) + case ::arrow::Type::LARGE_BINARY: + case ::arrow::Type::LARGE_STRING: + BINARY_LIKE_CASE(int64_t) + case ::arrow::Type::DECIMAL128: + case ::arrow::Type::DECIMAL256: + case ::arrow::Type::FIXED_SIZE_BINARY: { + const auto raw_values = values.data()->GetValues(1); + const auto byte_width = + static_cast(values).byte_width(); + return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { + return Roll(raw_values + i * byte_width, byte_width); + }); + } case ::arrow::Type::DICTIONARY: return GetBoundaries( def_levels, rep_levels, num_levels, *static_cast(values).indices()); - case ::arrow::Type::NA: - FakeNullArray fake_null_array; - return Calculate(def_levels, rep_levels, num_levels, fake_null_array); default: throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); } diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 77ab1b7784a..cc550e1a4dd 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -132,23 +132,25 @@ class ContentDefinedChunker { const ::arrow::Array& values); private: + void Roll(const bool value); + // Update the rolling hash with a compile-time known sized value, set has_matched_ to // true if the hash matches the mask. template - void Roll(const T value); + void Roll(const T* value); // Update the rolling hash with a binary-like value, set has_matched_ to true if the // hash matches the mask. - void Roll(std::string_view value); + void Roll(const uint8_t* value, int64_t num_bytes); // Evaluate whether a new chunk should be created based on the has_matched_, nth_run_ // and chunk_size_ state. inline bool NeedNewChunk(); // Calculate the chunk boundaries for typed Arrow arrays. 
- template + template const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const T& leaf_array); + int64_t num_levels, const RollFunc& RollValue); // Reference to the column's level information const internal::LevelInfo& level_info_; From 9792acbf8cc821818d6b5c13be2df203dec355ad Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 14:34:56 +0100 Subject: [PATCH 037/102] only hash non-null values in the nested case as well --- cpp/src/parquet/chunker_internal.cc | 54 ++++++++++++----------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index a85e7ccfaa3..874ff6f47fa 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -119,26 +119,26 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev int64_t num_levels, const RollFunc& RollValue) { std::vector chunks; + int64_t offset; + int64_t prev_offset = 0; + int64_t prev_value_offset = 0; bool has_def_levels = level_info_.def_level > 0; bool has_rep_levels = level_info_.rep_level > 0; if (!has_rep_levels && !has_def_levels) { // fastest path for non-nested non-null data - int64_t prev_offset = 0; - for (int64_t offset = 0; offset < num_levels; ++offset) { + for (offset = 0; offset < num_levels; ++offset) { RollValue(offset); if (NeedNewChunk()) { chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); prev_offset = offset; } } - if (prev_offset < num_levels) { - chunks.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); - } + // set the previous value offset to add the last chunk + prev_value_offset = prev_offset; } else if (!has_rep_levels) { // non-nested data with nulls int16_t def_level; - int64_t prev_offset = 0; for (int64_t offset = 0; offset < num_levels; ++offset) { def_level = def_levels[offset]; @@ -151,52 +151,44 @@ const std::vector 
ContentDefinedChunker::Calculate(const int16_t* def_lev prev_offset = offset; } } - if (prev_offset < num_levels) { - chunks.emplace_back(prev_offset, prev_offset, num_levels - prev_offset); - } + // set the previous value offset to add the last chunk + prev_value_offset = prev_offset; } else { // nested data with nulls - bool has_leaf_value; - bool is_record_boundary; int16_t def_level; int16_t rep_level; int64_t value_offset = 0; - int64_t prev_level_offset = 0; - int64_t prev_value_offset = 0; - for (int64_t level_offset = 0; level_offset < num_levels; ++level_offset) { - def_level = def_levels[level_offset]; - rep_level = rep_levels[level_offset]; - - has_leaf_value = def_level >= level_info_.repeated_ancestor_def_level; - is_record_boundary = rep_level == 0; + for (offset = 0; offset < num_levels; ++offset) { + def_level = def_levels[offset]; + rep_level = rep_levels[offset]; Roll(&def_level); Roll(&rep_level); - if (has_leaf_value) { + if (def_level == level_info_.def_level) { RollValue(value_offset); } - if (is_record_boundary && NeedNewChunk()) { - auto levels_to_write = level_offset - prev_level_offset; + if ((rep_level == 0) && NeedNewChunk()) { + // if we are at a record boundary and need a new chunk, we create a new chunk + auto levels_to_write = offset - prev_offset; if (levels_to_write > 0) { - chunks.emplace_back(prev_level_offset, prev_value_offset, levels_to_write); - prev_level_offset = level_offset; + chunks.emplace_back(prev_offset, prev_value_offset, levels_to_write); + prev_offset = offset; prev_value_offset = value_offset; } } - - if (has_leaf_value) { + if (def_level >= level_info_.repeated_ancestor_def_level) { + // we only increment the value offset if we have a leaf value ++value_offset; } } - - auto levels_to_write = num_levels - prev_level_offset; - if (levels_to_write > 0) { - chunks.emplace_back(prev_level_offset, prev_value_offset, levels_to_write); - } } + // add the last chunk if we have any levels left + if (prev_offset < 
num_levels) { + chunks.emplace_back(prev_offset, prev_value_offset, num_levels - prev_offset); + } return chunks; } From 74857628703f9ba996471c4c986b73f1748d856d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 15:32:24 +0100 Subject: [PATCH 038/102] add docstrings to the hashtable generating pythons script --- cpp/src/parquet/chunker_internal_codegen.py | 34 +++++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index 29cd856f3c4..f01e1a56d26 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -17,6 +17,32 @@ # specific language governing permissions and limitations # under the License. +""" +Produce the given number gearhash tables for rolling hash calculations. + +Each table consists of 256 64-bit integer values and by default 8 tables are +produced. The tables are written to a header file that can be included in the +C++ code. + +The generated numbers are deterministic "random" numbers created by MD5 hashing +a fixed seed and the table index. This ensures that the tables are the same +across different runs and platforms. The function of generating the numbers is +less important as long as they have sufficiently uniform distribution. + +Reference implementations: +- https://github.com/Borelset/destor/blob/master/src/chunking/fascdc_chunking.c +- https://github.com/nlfiedler/fastcdc-rs/blob/master/examples/table64.rs + +Usage: + python chunker_internal_codegen.py [ntables] + + ntables: Number of gearhash tables to generate (default 8), the + the C++ implementation expects 8 tables so this should not be + changed unless the C++ code is also updated. 
+ + The generated header file is written to ./chunker_internal_generated.h +""" + import hashlib import pathlib import sys @@ -54,18 +80,20 @@ def generate_hash(n: int, seed: int): + """Produce predictable hash values for a given seed and n using MD5.""" value = bytes([seed] * 64 + [n] * 64) hasher = hashlib.md5(value) return hasher.hexdigest()[:16] def generate_hashtable(seed: int, length=256): + """Generate and render a single gearhash table.""" table = [generate_hash(n, seed=seed) for n in range(length)] out = StringIO() out.write(f" {{// seed = {seed}\n") for i in range(0, length, 4): - values = [f"0x{value}" for value in table[i:i + 4]] + values = [f"0x{value}" for value in table[i : i + 4]] values = ", ".join(values) out.write(f" {values}") if i < length - 4: @@ -76,8 +104,8 @@ def generate_hashtable(seed: int, length=256): def generate_header(ntables=8, relative_path="chunker_internal_generated.h"): + """Generate a header file with multiple gearhash tables.""" path = pathlib.Path(__file__).parent / relative_path - tables = [generate_hashtable(seed) for seed in range(ntables)] text = template.format(content=",\n".join(tables)) path.write_text(text) @@ -85,4 +113,4 @@ def generate_header(ntables=8, relative_path="chunker_internal_generated.h"): if __name__ == "__main__": ntables = int(sys.argv[1]) if len(sys.argv) > 1 else 8 - generate_header(ntables) \ No newline at end of file + generate_header(ntables) From 9c3ea99d74321117801e104944209bb04041fd0d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 16:17:30 +0100 Subject: [PATCH 039/102] prefer to use signed integers as size arguments --- cpp/src/parquet/chunker_internal.cc | 22 +++- cpp/src/parquet/chunker_internal.h | 12 +- cpp/src/parquet/chunker_internal_test.cc | 148 +++++++++++------------ cpp/src/parquet/properties.h | 20 +-- 4 files changed, 104 insertions(+), 98 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 
874ff6f47fa..d13186ec91d 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -28,12 +28,12 @@ namespace parquet::internal { -static uint64_t GetMask(uint64_t min_size, uint64_t max_size, uint8_t norm_factor) { +static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) { // we aim for gaussian-like distribution of chunk sizes between min_size and max_size - uint64_t avg_size = (min_size + max_size) / 2; + int64_t avg_size = (min_size + max_size) / 2; // we skip calculating gearhash for the first `min_size` bytes, so we are looking for // a smaller chunk as the average size - uint64_t target_size = avg_size - min_size; + int64_t target_size = avg_size - min_size; size_t mask_bits = static_cast(std::floor(std::log2(target_size))); // -3 because we are using 8 hash tables to have more gaussian-like distribution // `norm_factor` narrows the chunk size distribution aroun avg_size @@ -42,12 +42,22 @@ static uint64_t GetMask(uint64_t min_size, uint64_t max_size, uint8_t norm_facto } ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, - uint64_t min_size, uint64_t max_size, - uint8_t norm_factor) + int64_t min_size, int64_t max_size, + int8_t norm_factor) : level_info_(level_info), min_size_(min_size), max_size_(max_size), - hash_mask_(GetMask(min_size, max_size, norm_factor)) {} + hash_mask_(GetMask(min_size, max_size, norm_factor)) { + if (min_size_ < 0) { + throw ParquetException("min_size must be non-negative"); + } + if (max_size_ < 0) { + throw ParquetException("max_size must be non-negative"); + } + if (min_size_ > max_size_) { + throw ParquetException("min_size must be less than or equal to max_size"); + } +} void ContentDefinedChunker::Roll(const bool value) { if (chunk_size_++ < min_size_) { diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index cc550e1a4dd..5299e1c8ec8 100644 --- a/cpp/src/parquet/chunker_internal.h +++ 
b/cpp/src/parquet/chunker_internal.h @@ -117,8 +117,8 @@ class ContentDefinedChunker { /// deduplication ratio is required at the expense of fragmentation, /// norm_factor>2 is typically not increasing the deduplication /// ratio. - ContentDefinedChunker(const LevelInfo& level_info, uint64_t min_size, uint64_t max_size, - uint8_t norm_factor = 0); + ContentDefinedChunker(const LevelInfo& level_info, int64_t min_size, int64_t max_size, + int8_t norm_factor = 0); /// Get the chunk boundaries for the given column data /// @@ -157,8 +157,8 @@ class ContentDefinedChunker { // Minimum chunk size in bytes, the rolling hash will not be updated until this size is // reached for each chunk. Note that all data sent through the hash function is counted // towards the chunk size, including definition and repetition levels. - const uint64_t min_size_; - const uint64_t max_size_; + const int64_t min_size_; + const int64_t max_size_; // The mask to match the rolling hash against to determine if a new chunk should be // created. The mask is calculated based on min/max chunk size and the normalization // factor. @@ -170,9 +170,9 @@ class ContentDefinedChunker { bool has_matched_ = false; // The current run of the rolling hash, used to normalize the chunk size distribution // by requiring multiple consecutive matches to create a new chunk. - uint64_t nth_run_ = 0; + int8_t nth_run_ = 0; // Current chunk size in bytes, reset to 0 when a new chunk is created. - uint64_t chunk_size_ = 0; + int64_t chunk_size_ = 0; // Rolling hash state, never reset only initialized once for the entire column. 
uint64_t rolling_hash_ = 0; }; diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index c4d963217f6..9062b74d536 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -67,13 +67,13 @@ inline uint64_t hash(uint64_t seed, uint64_t index) { template Result> GenerateArray(const std::shared_ptr& type, - bool nullable, int64_t length, uint64_t seed, + bool nullable, int64_t length, int64_t seed, ValueFunc value_func) { BuilderType builder(type, default_memory_pool()); if (nullable) { for (int64_t i = 0; i < length; ++i) { - uint64_t val = hash(seed, i); + int64_t val = hash(seed, i); if (val % 10 == 0) { RETURN_NOT_OK(builder.AppendNull()); } else { @@ -82,7 +82,7 @@ Result> GenerateArray(const std::shared_ptr& ty } } else { for (int64_t i = 0; i < length; ++i) { - uint64_t val = hash(seed, i); + int64_t val = hash(seed, i); RETURN_NOT_OK(builder.Append(value_func(val))); } } @@ -95,12 +95,12 @@ Result> GenerateArray(const std::shared_ptr& ty #define GENERATE_CASE(TYPE_ID, BUILDER_TYPE, VALUE_EXPR) \ case ::arrow::Type::TYPE_ID: { \ - auto value_func = [](uint64_t val) { return VALUE_EXPR; }; \ + auto value_func = [](int64_t val) { return VALUE_EXPR; }; \ return GenerateArray(type, nullable, length, seed, value_func); \ } Result> GenerateArray(const std::shared_ptr& field, - int64_t length, uint64_t seed) { + int64_t length, int64_t seed) { const std::shared_ptr& type = field->type(); bool nullable = field->nullable(); @@ -126,9 +126,7 @@ Result> GenerateArray(const std::shared_ptr& field // Limit the value to fit within the specified precision int32_t max_exponent = decimal_type.precision() - decimal_type.scale(); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); - auto value_func = [&](uint64_t val) { - return ::arrow::Decimal128(val % max_value); - }; + auto value_func = [&](int64_t val) { return ::arrow::Decimal128(val % max_value); }; return 
GenerateArray<::arrow::Decimal128Builder>(type, nullable, length, seed, value_func); } @@ -138,9 +136,7 @@ Result> GenerateArray(const std::shared_ptr& field // int64_t overflow int32_t max_exponent = std::min(9, decimal_type.precision() - decimal_type.scale()); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); - auto value_func = [&](uint64_t val) { - return ::arrow::Decimal256(val % max_value); - }; + auto value_func = [&](int64_t val) { return ::arrow::Decimal256(val % max_value); }; return GenerateArray<::arrow::Decimal256Builder>(type, nullable, length, seed, value_func); } @@ -163,7 +159,7 @@ Result> GenerateArray(const std::shared_ptr& field std::string("bin_") + std::to_string(val)) case ::arrow::Type::FIXED_SIZE_BINARY: { auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); - auto value_func = [size](uint64_t val) { + auto value_func = [size](int64_t val) { return std::string("bin_") + std::to_string(val).substr(0, size - 4); }; return GenerateArray<::arrow::FixedSizeBinaryBuilder>(type, nullable, length, seed, @@ -176,7 +172,7 @@ Result> GenerateArray(const std::shared_ptr& field for (auto i = 0; i < struct_type->num_fields(); i++) { ARROW_ASSIGN_OR_RAISE(auto child_array, GenerateArray(struct_type->field(i), length, - seed + static_cast(i + 300))); + seed + static_cast(i + 300))); child_arrays.push_back(child_array); } auto struct_array = @@ -239,7 +235,7 @@ Result> GenerateArray(const std::shared_ptr& field } Result> GenerateTable( - const std::shared_ptr<::arrow::Schema>& schema, int64_t size, uint64_t seed = 0) { + const std::shared_ptr<::arrow::Schema>& schema, int64_t size, int64_t seed = 0) { std::vector> arrays; for (const auto& field : schema->fields()) { ARROW_ASSIGN_OR_RAISE(auto array, GenerateArray(field, size, seed)); @@ -257,8 +253,8 @@ Result> ConcatAndCombine( } Result> WriteTableToBuffer(const std::shared_ptr
& table, - uint64_t min_chunk_size, - uint64_t max_chunk_size, + int64_t min_chunk_size, + int64_t max_chunk_size, bool enable_dictionary = false, int64_t row_group_size = 1024 * 1024) { @@ -293,8 +289,8 @@ Result> ReadTableFromBuffer(const std::shared_ptr } struct PageSizes { - std::vector lengths; - std::vector sizes; + std::vector lengths; + std::vector sizes; }; PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_index = 0) { @@ -322,7 +318,7 @@ PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_ind } Result WriteAndGetPageSizes(const std::shared_ptr
& table, - uint64_t min_chunk_size, uint64_t max_chunk_size, + int64_t min_chunk_size, int64_t max_chunk_size, bool enable_dictionary = false, int column_index = 0) { // Write the table to a buffer and read it back to get the page sizes @@ -339,7 +335,7 @@ Result WriteAndGetPageSizes(const std::shared_ptr
& table, return GetColumnPageSizes(buffer, column_index); } -void AssertAllBetween(const std::vector& values, uint64_t min, uint64_t max, +void AssertAllBetween(const std::vector& values, int64_t min, int64_t max, bool expect_dictionary_fallback = false) { // expect the last chunk since it is not guaranteed to be within the range if (expect_dictionary_fallback) { @@ -365,8 +361,8 @@ void AssertAllBetween(const std::vector& values, uint64_t min, uint64_ ASSERT_LE(values.back(), max); } -std::vector, std::vector>> FindDifferences( - const std::vector& first, const std::vector& second) { +std::vector, std::vector>> FindDifferences( + const std::vector& first, const std::vector& second) { // Compute LCS table. size_t n = first.size(), m = second.size(); std::vector> dp(n + 1, std::vector(m + 1, 0)); @@ -395,7 +391,7 @@ std::vector, std::vector>> FindDiffere std::reverse(common.begin(), common.end()); // Build raw differences. - std::vector, std::vector>> result; + std::vector, std::vector>> result; size_t last_i = 0, last_j = 0; for (auto& c : common) { auto ci = c.first; @@ -414,7 +410,7 @@ std::vector, std::vector>> FindDiffere // Merge adjacent diffs if one side is empty in the first diff and the other side // is empty in the next diff, to avoid splitting single changes into two parts. 
- std::vector, std::vector>> merged; + std::vector, std::vector>> merged; for (auto& diff : result) { if (!merged.empty()) { auto& prev = merged.back(); @@ -438,8 +434,8 @@ std::vector, std::vector>> FindDiffere } void PrintDifferences( - const std::vector& original, const std::vector& modified, - std::vector, std::vector>>& diffs) { + const std::vector& original, const std::vector& modified, + std::vector, std::vector>>& diffs) { std::cout << "Original: "; for (const auto& val : original) { std::cout << val << " "; @@ -468,60 +464,60 @@ void PrintDifferences( } TEST(TestFindDifferences, Basic) { - std::vector first = {1, 2, 3, 4, 5}; - std::vector second = {1, 7, 8, 4, 5}; + std::vector first = {1, 2, 3, 4, 5}; + std::vector second = {1, 7, 8, 4, 5}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({2, 3})); - ASSERT_EQ(diffs[0].second, std::vector({7, 8})); + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({7, 8})); } TEST(TestFindDifferences, MultipleDifferences) { - std::vector first = {1, 2, 3, 4, 5, 6, 7}; - std::vector second = {1, 8, 9, 4, 10, 6, 11}; + std::vector first = {1, 2, 3, 4, 5, 6, 7}; + std::vector second = {1, 8, 9, 4, 10, 6, 11}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 3); - ASSERT_EQ(diffs[0].first, std::vector({2, 3})); - ASSERT_EQ(diffs[0].second, std::vector({8, 9})); + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({8, 9})); - ASSERT_EQ(diffs[1].first, std::vector({5})); - ASSERT_EQ(diffs[1].second, std::vector({10})); + ASSERT_EQ(diffs[1].first, std::vector({5})); + ASSERT_EQ(diffs[1].second, std::vector({10})); - ASSERT_EQ(diffs[2].first, std::vector({7})); - ASSERT_EQ(diffs[2].second, std::vector({11})); + ASSERT_EQ(diffs[2].first, std::vector({7})); + ASSERT_EQ(diffs[2].second, std::vector({11})); } TEST(TestFindDifferences, DifferentLengths) { - std::vector 
first = {1, 2, 3}; - std::vector second = {1, 2, 3, 4, 5}; + std::vector first = {1, 2, 3}; + std::vector second = {1, 2, 3, 4, 5}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); ASSERT_TRUE(diffs[0].first.empty()); - ASSERT_EQ(diffs[0].second, std::vector({4, 5})); + ASSERT_EQ(diffs[0].second, std::vector({4, 5})); } TEST(TestFindDifferences, EmptyArrays) { - std::vector first = {}; - std::vector second = {}; + std::vector first = {}; + std::vector second = {}; auto diffs = FindDifferences(first, second); ASSERT_TRUE(diffs.empty()); } TEST(TestFindDifferences, LongSequenceWithSingleDifference) { - std::vector first = { + std::vector first = { 1994, 2193, 2700, 1913, 2052, }; - std::vector second = {2048, 43, 2080, 2700, 1913, 2052}; + std::vector second = {2048, 43, 2080, 2700, 1913, 2052}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({1994, 2193})); - ASSERT_EQ(diffs[0].second, std::vector({2048, 43, 2080})); + ASSERT_EQ(diffs[0].first, std::vector({1994, 2193})); + ASSERT_EQ(diffs[0].second, std::vector({2048, 43, 2080})); // Verify that elements after the difference are identical for (size_t i = 3; i < second.size(); i++) { @@ -530,15 +526,15 @@ TEST(TestFindDifferences, LongSequenceWithSingleDifference) { } TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { - std::vector first = {2169, 1976, 2180, 2147, 1934, 1772, - 1914, 2075, 2154, 1940, 1934, 1970}; - std::vector second = {2169, 1976, 2180, 2147, 2265, 1804, - 1717, 1925, 2122, 1940, 1934, 1970}; + std::vector first = {2169, 1976, 2180, 2147, 1934, 1772, + 1914, 2075, 2154, 1940, 1934, 1970}; + std::vector second = {2169, 1976, 2180, 2147, 2265, 1804, + 1717, 1925, 2122, 1940, 1934, 1970}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({1934, 1772, 1914, 2075, 2154})); - ASSERT_EQ(diffs[0].second, std::vector({2265, 1804, 1717, 1925, 
2122})); + ASSERT_EQ(diffs[0].first, std::vector({1934, 1772, 1914, 2075, 2154})); + ASSERT_EQ(diffs[0].second, std::vector({2265, 1804, 1717, 1925, 2122})); // Verify elements before and after the difference are identical for (size_t i = 0; i < 4; i++) { @@ -550,14 +546,14 @@ TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { } TEST(TestFindDifferences, AdditionalCase) { - std::vector original = {445, 312, 393, 401, 410, 138, 558, 457}; - std::vector modified = {445, 312, 393, 393, 410, 138, 558, 457}; + std::vector original = {445, 312, 393, 401, 410, 138, 558, 457}; + std::vector modified = {445, 312, 393, 393, 410, 138, 558, 457}; auto diffs = FindDifferences(original, modified); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({401})); - ASSERT_EQ(diffs[0].second, std::vector({393})); + ASSERT_EQ(diffs[0].first, std::vector({401})); + ASSERT_EQ(diffs[0].second, std::vector({393})); // Verify elements before and after the difference are identical for (size_t i = 0; i < 3; i++) { @@ -569,8 +565,8 @@ TEST(TestFindDifferences, AdditionalCase) { } void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications) { + const std::vector& original, + const std::vector& modified, uint8_t n_modifications) { auto diffs = FindDifferences(original, modified); if (diffs.size() > n_modifications) { PrintDifferences(original, modified, diffs); @@ -595,9 +591,9 @@ void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, } void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications, - uint64_t edit_length) { + const std::vector& original, + const std::vector& modified, uint8_t n_modifications, + int64_t edit_length) { auto diffs = FindDifferences(original, modified); if (diffs.size() != n_modifications) { PrintDifferences(original, modified, diffs); @@ 
-606,7 +602,7 @@ void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, for (const auto& diff : diffs) { if (!::arrow::is_list_like(dtype->id())) { - uint64_t left_sum = 0, right_sum = 0; + int64_t left_sum = 0, right_sum = 0; for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum, right_sum + edit_length); @@ -617,9 +613,9 @@ void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, } void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications, - uint64_t edit_length) { + const std::vector& original, + const std::vector& modified, uint8_t n_modifications, + int64_t edit_length) { auto diffs = FindDifferences(original, modified); if (diffs.size() != n_modifications) { PrintDifferences(original, modified, diffs); @@ -638,8 +634,8 @@ void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, } } -void AssertAppendCase(const std::vector& original, - const std::vector& modified) { +void AssertAppendCase(const std::vector& original, + const std::vector& modified) { ASSERT_GE(modified.size(), original.size()); for (size_t i = 0; i < original.size() - 1; i++) { ASSERT_EQ(original[i], modified[i]); @@ -647,7 +643,7 @@ void AssertAppendCase(const std::vector& original, ASSERT_GT(modified[original.size() - 1], original.back()); } -uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { +uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { if (nullable) { // in case of nullable types the def_levels are also fed through the chunker // to identify changes in the null bitmap, this will increase the byte width @@ -659,9 +655,9 @@ uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, PageSizes base_result, PageSizes modified_result, bool nullable, - 
bool enable_dictionary, uint64_t min_chunk_size, - uint64_t max_chunk_size) { - max_chunk_size = static_cast(max_chunk_size * 1.2); + bool enable_dictionary, int64_t min_chunk_size, + int64_t max_chunk_size) { + max_chunk_size = static_cast(max_chunk_size * 1.2); if (::arrow::is_fixed_width(dtype->id())) { auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); @@ -676,10 +672,10 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, } } -constexpr uint64_t kMinChunkSize = 8 * 1024; -constexpr uint64_t kMaxChunkSize = 32 * 1024; -constexpr uint64_t kPartSize = 128 * 1024; -constexpr uint64_t kEditSize = 128; +constexpr int64_t kMinChunkSize = 8 * 1024; +constexpr int64_t kMaxChunkSize = 32 * 1024; +constexpr int64_t kPartSize = 128 * 1024; +constexpr int64_t kEditSize = 128; class TestColumnCDC : public ::testing::TestWithParam< std::tuple, bool, size_t>> { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index a730348e3dc..c6d97acc1d1 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -167,9 +167,9 @@ static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOM static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = SizeStatisticsLevel::PageAndColumnChunk; -static constexpr std::pair DEFAULT_CDC_SIZE_RANGE = +static constexpr std::pair DEFAULT_CDC_SIZE_RANGE = std::make_pair(256 * 1024, 1024 * 1024); -static constexpr uint8_t DEFAULT_CDC_NORM_FACTOR = 0; +static constexpr int8_t DEFAULT_CDC_NORM_FACTOR = 0; class PARQUET_EXPORT ColumnProperties { public: @@ -298,12 +298,12 @@ class PARQUET_EXPORT WriterProperties { return this; } - Builder* cdc_size_range(uint64_t min_size, uint64_t max_size) { + Builder* cdc_size_range(int64_t min_size, int64_t max_size) { cdc_size_range_ = std::make_pair(min_size, 
max_size); return this; } - Builder* cdc_norm_factor(uint8_t norm_factor) { + Builder* cdc_norm_factor(int8_t norm_factor) { cdc_norm_factor_ = norm_factor; return this; } @@ -762,8 +762,8 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map page_index_enabled_; bool cdc_enabled_; - std::pair cdc_size_range_; - uint8_t cdc_norm_factor_; + std::pair cdc_size_range_; + int8_t cdc_norm_factor_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -789,8 +789,8 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } inline bool cdc_enabled() const { return cdc_enabled_; } - inline std::pair cdc_size_range() const { return cdc_size_range_; } - inline uint8_t cdc_norm_factor() const { return cdc_norm_factor_; } + inline std::pair cdc_size_range() const { return cdc_size_range_; } + inline int8_t cdc_norm_factor() const { return cdc_norm_factor_; } inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; @@ -895,7 +895,7 @@ class PARQUET_EXPORT WriterProperties { const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, std::vector sorting_columns, bool cdc_enabled, - std::pair cdc_size_range, uint8_t cdc_norm_factor) + std::pair cdc_size_range, int8_t cdc_norm_factor) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -936,7 +936,7 @@ class PARQUET_EXPORT WriterProperties { bool cdc_enabled_; std::pair cdc_size_range_; - uint8_t cdc_norm_factor_; + int8_t cdc_norm_factor_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); From c439e59f267c1c8ea5ca1ab13a67afdfc6b92f52 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 16:54:50 +0100 Subject: [PATCH 040/102] use type aliases for better readability in tests --- cpp/src/parquet/chunker_internal_test.cc | 230 ++++++++++++----------- 1 file changed, 118 
insertions(+), 112 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 9062b74d536..740d16c7e2e 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -67,13 +67,13 @@ inline uint64_t hash(uint64_t seed, uint64_t index) { template Result> GenerateArray(const std::shared_ptr& type, - bool nullable, int64_t length, int64_t seed, + bool nullable, int64_t length, uint64_t seed, ValueFunc value_func) { BuilderType builder(type, default_memory_pool()); if (nullable) { for (int64_t i = 0; i < length; ++i) { - int64_t val = hash(seed, i); + uint64_t val = hash(seed, i); if (val % 10 == 0) { RETURN_NOT_OK(builder.AppendNull()); } else { @@ -82,7 +82,7 @@ Result> GenerateArray(const std::shared_ptr& ty } } else { for (int64_t i = 0; i < length; ++i) { - int64_t val = hash(seed, i); + uint64_t val = hash(seed, i); RETURN_NOT_OK(builder.Append(value_func(val))); } } @@ -95,7 +95,7 @@ Result> GenerateArray(const std::shared_ptr& ty #define GENERATE_CASE(TYPE_ID, BUILDER_TYPE, VALUE_EXPR) \ case ::arrow::Type::TYPE_ID: { \ - auto value_func = [](int64_t val) { return VALUE_EXPR; }; \ + auto value_func = [](uint64_t val) { return VALUE_EXPR; }; \ return GenerateArray(type, nullable, length, seed, value_func); \ } @@ -126,7 +126,9 @@ Result> GenerateArray(const std::shared_ptr& field // Limit the value to fit within the specified precision int32_t max_exponent = decimal_type.precision() - decimal_type.scale(); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); - auto value_func = [&](int64_t val) { return ::arrow::Decimal128(val % max_value); }; + auto value_func = [&](uint64_t val) { + return ::arrow::Decimal128(val % max_value); + }; return GenerateArray<::arrow::Decimal128Builder>(type, nullable, length, seed, value_func); } @@ -136,7 +138,9 @@ Result> GenerateArray(const std::shared_ptr& field // int64_t overflow int32_t max_exponent = std::min(9, 
decimal_type.precision() - decimal_type.scale()); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); - auto value_func = [&](int64_t val) { return ::arrow::Decimal256(val % max_value); }; + auto value_func = [&](uint64_t val) { + return ::arrow::Decimal256(val % max_value); + }; return GenerateArray<::arrow::Decimal256Builder>(type, nullable, length, seed, value_func); } @@ -159,7 +163,7 @@ Result> GenerateArray(const std::shared_ptr& field std::string("bin_") + std::to_string(val)) case ::arrow::Type::FIXED_SIZE_BINARY: { auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); - auto value_func = [size](int64_t val) { + auto value_func = [size](uint64_t val) { return std::string("bin_") + std::to_string(val).substr(0, size - 4); }; return GenerateArray<::arrow::FixedSizeBinaryBuilder>(type, nullable, length, seed, @@ -172,7 +176,7 @@ Result> GenerateArray(const std::shared_ptr& field for (auto i = 0; i < struct_type->num_fields(); i++) { ARROW_ASSIGN_OR_RAISE(auto child_array, GenerateArray(struct_type->field(i), length, - seed + static_cast(i + 300))); + seed + static_cast(i + 300))); child_arrays.push_back(child_array); } auto struct_array = @@ -256,7 +260,6 @@ Result> WriteTableToBuffer(const std::shared_ptr
& int64_t min_chunk_size, int64_t max_chunk_size, bool enable_dictionary = false, - int64_t row_group_size = 1024 * 1024) { auto sink = CreateOutputStream(); @@ -288,16 +291,20 @@ Result> ReadTableFromBuffer(const std::shared_ptr return result; } -struct PageSizes { - std::vector lengths; - std::vector sizes; +// Type to represent a list of chunks where each element is the size of the chunk. +using ChunkList = std::vector; + +// Type to represent the sizes and lengths of the data pages in a column. +struct PageInfo { + ChunkList lengths; + ChunkList sizes; }; -PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_index = 0) { +PageInfo GetColumnPageInfo(const std::shared_ptr& data, int column_index = 0) { // Read the parquet data out of the buffer and get the sizes and lengths of the // data pages in given column. We assert on the sizes and lengths of the pages // to ensure that the chunking is done correctly. - PageSizes result; + PageInfo result; auto buffer_reader = std::make_shared(data); auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); @@ -317,10 +324,10 @@ PageSizes GetColumnPageSizes(const std::shared_ptr& data, int column_ind return result; } -Result WriteAndGetPageSizes(const std::shared_ptr
& table, - int64_t min_chunk_size, int64_t max_chunk_size, - bool enable_dictionary = false, - int column_index = 0) { +Result WriteAndGetPageInfo(const std::shared_ptr
& table, + uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, + int column_index = 0) { // Write the table to a buffer and read it back to get the page sizes ARROW_ASSIGN_OR_RAISE( auto buffer, @@ -332,10 +339,10 @@ Result WriteAndGetPageSizes(const std::shared_ptr
& table, ARROW_RETURN_IF(!readback->Equals(*table), Status::Invalid("Readback table not equal to original")); } - return GetColumnPageSizes(buffer, column_index); + return GetColumnPageInfo(buffer, column_index); } -void AssertAllBetween(const std::vector& values, int64_t min, int64_t max, +void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max, bool expect_dictionary_fallback = false) { // expect the last chunk since it is not guaranteed to be within the range if (expect_dictionary_fallback) { @@ -344,26 +351,29 @@ void AssertAllBetween(const std::vector& values, int64_t min, int64_t m // guarantee that all chunks are within the range in this case, but we // know that there can be at most 2 pages smaller than the min_chunk_size size_t smaller_count = 0; - for (size_t i = 0; i < values.size() - 1; i++) { - if (values[i] < min) { + for (size_t i = 0; i < chunks.size() - 1; i++) { + if (chunks[i] < min) { smaller_count++; } else { - ASSERT_LE(values[i], max); + ASSERT_LE(chunks[i], max); } } ASSERT_LE(smaller_count, 2); } else { - for (size_t i = 0; i < values.size() - 1; i++) { - ASSERT_GE(values[i], min); - ASSERT_LE(values[i], max); + for (size_t i = 0; i < chunks.size() - 1; i++) { + ASSERT_GE(chunks[i], min); + ASSERT_LE(chunks[i], max); } } - ASSERT_LE(values.back(), max); + ASSERT_LE(chunks.back(), max); } -std::vector, std::vector>> FindDifferences( - const std::vector& first, const std::vector& second) { - // Compute LCS table. +// A git-hunk like side-by-side data structure to represent the differences between two +// vectors of uint64_t values. +using ChunkDiff = std::pair; + +std::vector FindDifferences(const ChunkList& first, const ChunkList& second) { + // Compute longest-common-subsequence between the two vectors. 
size_t n = first.size(), m = second.size(); std::vector> dp(n + 1, std::vector(m + 1, 0)); for (size_t i = 0; i < n; i++) { @@ -391,7 +401,7 @@ std::vector, std::vector>> FindDifferenc std::reverse(common.begin(), common.end()); // Build raw differences. - std::vector, std::vector>> result; + std::vector result; size_t last_i = 0, last_j = 0; for (auto& c : common) { auto ci = c.first; @@ -410,7 +420,7 @@ std::vector, std::vector>> FindDifferenc // Merge adjacent diffs if one side is empty in the first diff and the other side // is empty in the next diff, to avoid splitting single changes into two parts. - std::vector, std::vector>> merged; + std::vector merged; for (auto& diff : result) { if (!merged.empty()) { auto& prev = merged.back(); @@ -433,9 +443,8 @@ std::vector, std::vector>> FindDifferenc return merged; } -void PrintDifferences( - const std::vector& original, const std::vector& modified, - std::vector, std::vector>>& diffs) { +void PrintDifferences(const ChunkList& original, const ChunkList& modified, + std::vector& diffs) { std::cout << "Original: "; for (const auto& val : original) { std::cout << val << " "; @@ -464,60 +473,60 @@ void PrintDifferences( } TEST(TestFindDifferences, Basic) { - std::vector first = {1, 2, 3, 4, 5}; - std::vector second = {1, 7, 8, 4, 5}; + ChunkList first = {1, 2, 3, 4, 5}; + ChunkList second = {1, 7, 8, 4, 5}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({2, 3})); - ASSERT_EQ(diffs[0].second, std::vector({7, 8})); + ASSERT_EQ(diffs[0].first, ChunkList({2, 3})); + ASSERT_EQ(diffs[0].second, ChunkList({7, 8})); } TEST(TestFindDifferences, MultipleDifferences) { - std::vector first = {1, 2, 3, 4, 5, 6, 7}; - std::vector second = {1, 8, 9, 4, 10, 6, 11}; + ChunkList first = {1, 2, 3, 4, 5, 6, 7}; + ChunkList second = {1, 8, 9, 4, 10, 6, 11}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 3); - ASSERT_EQ(diffs[0].first, std::vector({2, 
3})); - ASSERT_EQ(diffs[0].second, std::vector({8, 9})); + ASSERT_EQ(diffs[0].first, ChunkList({2, 3})); + ASSERT_EQ(diffs[0].second, ChunkList({8, 9})); - ASSERT_EQ(diffs[1].first, std::vector({5})); - ASSERT_EQ(diffs[1].second, std::vector({10})); + ASSERT_EQ(diffs[1].first, ChunkList({5})); + ASSERT_EQ(diffs[1].second, ChunkList({10})); - ASSERT_EQ(diffs[2].first, std::vector({7})); - ASSERT_EQ(diffs[2].second, std::vector({11})); + ASSERT_EQ(diffs[2].first, ChunkList({7})); + ASSERT_EQ(diffs[2].second, ChunkList({11})); } TEST(TestFindDifferences, DifferentLengths) { - std::vector first = {1, 2, 3}; - std::vector second = {1, 2, 3, 4, 5}; + ChunkList first = {1, 2, 3}; + ChunkList second = {1, 2, 3, 4, 5}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); ASSERT_TRUE(diffs[0].first.empty()); - ASSERT_EQ(diffs[0].second, std::vector({4, 5})); + ASSERT_EQ(diffs[0].second, ChunkList({4, 5})); } TEST(TestFindDifferences, EmptyArrays) { - std::vector first = {}; - std::vector second = {}; + ChunkList first = {}; + ChunkList second = {}; auto diffs = FindDifferences(first, second); ASSERT_TRUE(diffs.empty()); } TEST(TestFindDifferences, LongSequenceWithSingleDifference) { - std::vector first = { + ChunkList first = { 1994, 2193, 2700, 1913, 2052, }; - std::vector second = {2048, 43, 2080, 2700, 1913, 2052}; + ChunkList second = {2048, 43, 2080, 2700, 1913, 2052}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({1994, 2193})); - ASSERT_EQ(diffs[0].second, std::vector({2048, 43, 2080})); + ASSERT_EQ(diffs[0].first, ChunkList({1994, 2193})); + ASSERT_EQ(diffs[0].second, ChunkList({2048, 43, 2080})); // Verify that elements after the difference are identical for (size_t i = 3; i < second.size(); i++) { @@ -526,15 +535,15 @@ TEST(TestFindDifferences, LongSequenceWithSingleDifference) { } TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { - std::vector first = {2169, 1976, 2180, 
2147, 1934, 1772, - 1914, 2075, 2154, 1940, 1934, 1970}; - std::vector second = {2169, 1976, 2180, 2147, 2265, 1804, - 1717, 1925, 2122, 1940, 1934, 1970}; + ChunkList first = {2169, 1976, 2180, 2147, 1934, 1772, + 1914, 2075, 2154, 1940, 1934, 1970}; + ChunkList second = {2169, 1976, 2180, 2147, 2265, 1804, + 1717, 1925, 2122, 1940, 1934, 1970}; auto diffs = FindDifferences(first, second); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({1934, 1772, 1914, 2075, 2154})); - ASSERT_EQ(diffs[0].second, std::vector({2265, 1804, 1717, 1925, 2122})); + ASSERT_EQ(diffs[0].first, ChunkList({1934, 1772, 1914, 2075, 2154})); + ASSERT_EQ(diffs[0].second, ChunkList({2265, 1804, 1717, 1925, 2122})); // Verify elements before and after the difference are identical for (size_t i = 0; i < 4; i++) { @@ -546,14 +555,14 @@ TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { } TEST(TestFindDifferences, AdditionalCase) { - std::vector original = {445, 312, 393, 401, 410, 138, 558, 457}; - std::vector modified = {445, 312, 393, 393, 410, 138, 558, 457}; + ChunkList original = {445, 312, 393, 401, 410, 138, 558, 457}; + ChunkList modified = {445, 312, 393, 393, 410, 138, 558, 457}; auto diffs = FindDifferences(original, modified); ASSERT_EQ(diffs.size(), 1); - ASSERT_EQ(diffs[0].first, std::vector({401})); - ASSERT_EQ(diffs[0].second, std::vector({393})); + ASSERT_EQ(diffs[0].first, ChunkList({401})); + ASSERT_EQ(diffs[0].second, ChunkList({393})); // Verify elements before and after the difference are identical for (size_t i = 0; i < 3; i++) { @@ -565,8 +574,8 @@ TEST(TestFindDifferences, AdditionalCase) { } void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications) { + const ChunkList& original, const ChunkList& modified, + uint8_t n_modifications) { auto diffs = FindDifferences(original, modified); if (diffs.size() > n_modifications) { 
PrintDifferences(original, modified, diffs); @@ -591,9 +600,8 @@ void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, } void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications, - int64_t edit_length) { + const ChunkList& original, const ChunkList& modified, + uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); if (diffs.size() != n_modifications) { PrintDifferences(original, modified, diffs); @@ -602,7 +610,7 @@ void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, for (const auto& diff : diffs) { if (!::arrow::is_list_like(dtype->id())) { - int64_t left_sum = 0, right_sum = 0; + uint64_t left_sum = 0, right_sum = 0; for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum, right_sum + edit_length); @@ -613,9 +621,8 @@ void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, } void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, - const std::vector& original, - const std::vector& modified, uint8_t n_modifications, - int64_t edit_length) { + const ChunkList& original, const ChunkList& modified, + uint8_t n_modifications, uint64_t edit_length) { auto diffs = FindDifferences(original, modified); if (diffs.size() != n_modifications) { PrintDifferences(original, modified, diffs); @@ -634,8 +641,7 @@ void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, } } -void AssertAppendCase(const std::vector& original, - const std::vector& modified) { +void AssertAppendCase(const ChunkList& original, const ChunkList& modified) { ASSERT_GE(modified.size(), original.size()); for (size_t i = 0; i < original.size() - 1; i++) { ASSERT_EQ(original[i], modified[i]); @@ -643,7 +649,7 @@ void AssertAppendCase(const std::vector& original, ASSERT_GT(modified[original.size() - 1], original.back()); 
} -uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { +uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { if (nullable) { // in case of nullable types the def_levels are also fed through the chunker // to identify changes in the null bitmap, this will increase the byte width @@ -654,10 +660,10 @@ uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { } void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, - PageSizes base_result, PageSizes modified_result, bool nullable, - bool enable_dictionary, int64_t min_chunk_size, - int64_t max_chunk_size) { - max_chunk_size = static_cast(max_chunk_size * 1.2); + PageInfo base_result, PageInfo modified_result, bool nullable, + bool enable_dictionary, uint64_t min_chunk_size, + uint64_t max_chunk_size) { + max_chunk_size = static_cast(max_chunk_size * 1.2); if (::arrow::is_fixed_width(dtype->id())) { auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); @@ -710,11 +716,11 @@ TEST_P(TestColumnCDC, DeleteOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -734,11 +740,11 @@ TEST_P(TestColumnCDC, DeleteTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, 
kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -756,11 +762,11 @@ TEST_P(TestColumnCDC, UpdateOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -779,11 +785,11 @@ TEST_P(TestColumnCDC, UpdateTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ 
-800,11 +806,11 @@ TEST_P(TestColumnCDC, InsertOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -823,11 +829,11 @@ TEST_P(TestColumnCDC, InsertTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -845,11 +851,11 @@ TEST_P(TestColumnCDC, Append) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageSizes(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageSizes(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(modified, kMinChunkSize, 
kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); @@ -866,8 +872,8 @@ TEST_P(TestColumnCDC, EmptyTable) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto result, - WriteAndGetPageSizes(empty_table, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + WriteAndGetPageInfo(empty_table, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); // An empty table should result in no data pages ASSERT_TRUE(result.lengths.empty()); From 3a31d93c0ba02a4aca675942d982877e7cbb014d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 17:26:12 +0100 Subject: [PATCH 041/102] use explicit struct instead of tuples for the test case configuration --- cpp/src/parquet/chunker_internal_test.cc | 230 +++++++++++------------ 1 file changed, 113 insertions(+), 117 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 740d16c7e2e..4614d2511e2 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -649,7 +649,7 @@ void AssertAppendCase(const ChunkList& original, const ChunkList& modified) { ASSERT_GT(modified[original.size() - 1], original.back()); } -uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { +uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { if (nullable) { // in case of nullable types the def_levels are also fed through the chunker // to identify changes in the null bitmap, this will increase the byte width @@ -683,20 +683,29 @@ constexpr int64_t kMaxChunkSize = 32 * 1024; constexpr int64_t kPartSize = 128 * 1024; constexpr int64_t kEditSize = 128; -class TestColumnCDC : public ::testing::TestWithParam< - std::tuple, bool, size_t>> { +struct CaseConfig { + // Arrow data type to generate the testing data for + 
std::shared_ptr<::arrow::DataType> dtype; + // Whether the data type is nullable + bool is_nullable; + // Approximate number of bytes per record to calculate the number of elements to + // generate + size_t bytes_per_record; +}; + +class TestColumnCDC : public ::testing::TestWithParam { protected: // Column random table parts for testing std::shared_ptr field_; std::shared_ptr
part1_, part2_, part3_, part4_, part5_, part6_, part7_; void SetUp() override { - auto [dtype, nullable, byte_per_record] = GetParam(); - auto field_ = ::arrow::field("f0", dtype, nullable); + const auto& param = GetParam(); + auto field_ = ::arrow::field("f0", param.dtype, param.is_nullable); auto schema = ::arrow::schema({field_}); - auto part_length = kPartSize / byte_per_record; - auto edit_length = kEditSize / byte_per_record; + auto part_length = kPartSize / param.bytes_per_record; + auto edit_length = kEditSize / param.bytes_per_record; ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, part_length, 0)); ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, edit_length, 1)); ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, part_length, part_length)); @@ -708,30 +717,30 @@ class TestColumnCDC : public ::testing::TestWithParam< }; TEST_P(TestColumnCDC, DeleteOnce) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); - 
AssertDeleteCase(dtype, base_result.lengths, modified_result.lengths, 1, + AssertDeleteCase(param.dtype, base_result.lengths, modified_result.lengths, 1, part2_->num_rows()); } } TEST_P(TestColumnCDC, DeleteTwice) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_, part4_, part5_})); @@ -739,43 +748,43 @@ TEST_P(TestColumnCDC, DeleteTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertDeleteCase(dtype, base_result.lengths, modified_result.lengths, 2, + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertDeleteCase(param.dtype, base_result.lengths, modified_result.lengths, 2, part2_->num_rows()); } } TEST_P(TestColumnCDC, UpdateOnce) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part4_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - 
/*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertUpdateCase(dtype, base_result.lengths, modified_result.lengths, 1); + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertUpdateCase(param.dtype, base_result.lengths, modified_result.lengths, 1); } } TEST_P(TestColumnCDC, UpdateTwice) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_, part4_, part5_})); @@ -784,43 +793,43 @@ TEST_P(TestColumnCDC, UpdateTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertUpdateCase(dtype, base_result.lengths, modified_result.lengths, 2); + AssertChunkSizes(param.dtype, 
base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertUpdateCase(param.dtype, base_result.lengths, modified_result.lengths, 2); } } TEST_P(TestColumnCDC, InsertOnce) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_})); ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part2_, part3_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertInsertCase(dtype, base_result.lengths, modified_result.lengths, 1, + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertInsertCase(param.dtype, base_result.lengths, modified_result.lengths, 1, part2_->num_rows()); } } TEST_P(TestColumnCDC, InsertTwice) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_, part5_})); ASSERT_OK_AND_ASSIGN(auto modified, @@ -828,52 +837,52 @@ TEST_P(TestColumnCDC, InsertTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - 
/*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); - AssertInsertCase(dtype, base_result.lengths, modified_result.lengths, 2, + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertInsertCase(param.dtype, base_result.lengths, modified_result.lengths, 2, part2_->num_rows()); } } TEST_P(TestColumnCDC, Append) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part2_, part3_, part4_})); ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); - - AssertChunkSizes(dtype, base_result, modified_result, nullable, enable_dictionary, - kMinChunkSize, kMaxChunkSize); + ASSERT_OK_AND_ASSIGN( + auto base_result, + WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto modified_result, + WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + + AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, + 
enable_dictionary, kMinChunkSize, kMaxChunkSize); AssertAppendCase(base_result.lengths, modified_result.lengths); } } TEST_P(TestColumnCDC, EmptyTable) { - auto [dtype, nullable, _] = GetParam(); + const auto& param = GetParam(); - auto schema = ::arrow::schema({::arrow::field("f0", dtype, nullable)}); + auto schema = ::arrow::schema({::arrow::field("f0", param.dtype, param.is_nullable)}); ASSERT_OK_AND_ASSIGN(auto empty_table, GenerateTable(schema, 0, 0)); ASSERT_EQ(empty_table->num_rows(), 0); for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto result, WriteAndGetPageInfo(empty_table, kMinChunkSize, kMaxChunkSize, - /*enable_dictionary=*/enable_dictionary)); + enable_dictionary)); // An empty table should result in no data pages ASSERT_TRUE(result.lengths.empty()); @@ -881,43 +890,30 @@ TEST_P(TestColumnCDC, EmptyTable) { } } -// TODO(kszucs): add extension type and dictionary type INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, testing::Values( - // Numeric - std::make_tuple(::arrow::uint8(), false, 1), - std::make_tuple(::arrow::uint16(), false, 2), - std::make_tuple(::arrow::uint32(), false, 4), - std::make_tuple(::arrow::uint64(), true, 8), - std::make_tuple(::arrow::int8(), false, 1), - std::make_tuple(::arrow::int16(), false, 2), - std::make_tuple(::arrow::int32(), false, 4), - std::make_tuple(::arrow::int64(), true, 8), - std::make_tuple(::arrow::float16(), false, 2), - std::make_tuple(::arrow::float32(), false, 4), - std::make_tuple(::arrow::float64(), true, 8), - std::make_tuple(::arrow::decimal128(18, 6), false, 16), - std::make_tuple(::arrow::decimal256(40, 6), false, 32), - // Binary-like - std::make_tuple(::arrow::utf8(), false, 16), - std::make_tuple(::arrow::binary(), true, 16), - std::make_tuple(::arrow::fixed_size_binary(16), true, 16), - - // Temporal - std::make_tuple(::arrow::date32(), false, 4), - std::make_tuple(::arrow::time32(::arrow::TimeUnit::MILLI), true, 4), - 
std::make_tuple(::arrow::time64(::arrow::TimeUnit::NANO), false, 8), - std::make_tuple(::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8), - std::make_tuple(::arrow::duration(::arrow::TimeUnit::NANO), false, 8), - // Nested types - std::make_tuple(::arrow::list(::arrow::int32()), false, 16), - std::make_tuple(::arrow::list(::arrow::int32()), true, 18), - std::make_tuple(::arrow::list(::arrow::utf8()), true, 18), - std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, - 8), - std::make_tuple(::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), - true, 10))); + CaseConfig{::arrow::uint8(), false, 1}, CaseConfig{::arrow::uint16(), false, 2}, + CaseConfig{::arrow::uint32(), false, 4}, CaseConfig{::arrow::uint64(), true, 8}, + CaseConfig{::arrow::int8(), false, 1}, CaseConfig{::arrow::int16(), false, 2}, + CaseConfig{::arrow::int32(), false, 4}, CaseConfig{::arrow::int64(), true, 8}, + CaseConfig{::arrow::float16(), false, 2}, + CaseConfig{::arrow::float32(), false, 4}, CaseConfig{::arrow::float64(), true, 8}, + CaseConfig{::arrow::decimal128(18, 6), false, 16}, + CaseConfig{::arrow::decimal256(40, 6), false, 32}, + CaseConfig{::arrow::utf8(), false, 16}, CaseConfig{::arrow::binary(), true, 16}, + CaseConfig{::arrow::fixed_size_binary(16), true, 16}, + CaseConfig{::arrow::date32(), false, 4}, + CaseConfig{::arrow::time32(::arrow::TimeUnit::MILLI), true, 4}, + CaseConfig{::arrow::time64(::arrow::TimeUnit::NANO), false, 8}, + CaseConfig{::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8}, + CaseConfig{::arrow::duration(::arrow::TimeUnit::NANO), false, 8}, + CaseConfig{::arrow::list(::arrow::int32()), false, 16}, + CaseConfig{::arrow::list(::arrow::int32()), true, 18}, + CaseConfig{::arrow::list(::arrow::utf8()), true, 18}, + CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, 8}, + CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true, + 10})); } // namespace parquet From 
b3b2b3e9827373829ebe2eea6b17055b51653706 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 18:04:38 +0100 Subject: [PATCH 042/102] add a boolean test case --- cpp/src/parquet/chunker_internal_test.cc | 81 ++++++++++++++---------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 4614d2511e2..f998dc01675 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -342,32 +342,6 @@ Result WriteAndGetPageInfo(const std::shared_ptr
& table, return GetColumnPageInfo(buffer, column_index); } -void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max, - bool expect_dictionary_fallback = false) { - // expect the last chunk since it is not guaranteed to be within the range - if (expect_dictionary_fallback) { - // if dictionary encoding is enabled, the writer can fallback to plain - // encoding splitting within a content defined chunk, so we can't - // guarantee that all chunks are within the range in this case, but we - // know that there can be at most 2 pages smaller than the min_chunk_size - size_t smaller_count = 0; - for (size_t i = 0; i < chunks.size() - 1; i++) { - if (chunks[i] < min) { - smaller_count++; - } else { - ASSERT_LE(chunks[i], max); - } - } - ASSERT_LE(smaller_count, 2); - } else { - for (size_t i = 0; i < chunks.size() - 1; i++) { - ASSERT_GE(chunks[i], min); - ASSERT_LE(chunks[i], max); - } - } - ASSERT_LE(chunks.back(), max); -} - // A git-hunk like side-by-side data structure to represent the differences between two // vectors of uint64_t values. 
using ChunkDiff = std::pair; @@ -659,14 +633,42 @@ uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { return size / byte_width; } +void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max, + bool expect_dictionary_fallback = false) { + // expect the last chunk since it is not guaranteed to be within the range + if (expect_dictionary_fallback) { + // if dictionary encoding is enabled, the writer can fallback to plain + // encoding splitting within a content defined chunk, so we can't + // guarantee that all chunks are within the range in this case, but we + // know that there can be at most 2 pages smaller than the min_chunk_size + size_t smaller_count = 0; + for (size_t i = 0; i < chunks.size() - 1; i++) { + if (chunks[i] < min) { + smaller_count++; + } else { + ASSERT_LE(chunks[i], max); + } + } + ASSERT_LE(smaller_count, 2); + } else { + for (size_t i = 0; i < chunks.size() - 1; i++) { + ASSERT_GE(chunks[i], min); + ASSERT_LE(chunks[i], max); + } + } + ASSERT_LE(chunks.back(), max); +} + void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, PageInfo base_result, PageInfo modified_result, bool nullable, - bool enable_dictionary, uint64_t min_chunk_size, - uint64_t max_chunk_size) { - max_chunk_size = static_cast(max_chunk_size * 1.2); - if (::arrow::is_fixed_width(dtype->id())) { - auto min_length = ElementCount(min_chunk_size, dtype->byte_width(), nullable); - auto max_length = ElementCount(max_chunk_size, dtype->byte_width(), nullable); + bool enable_dictionary, int64_t min_chunk_size, + int64_t max_chunk_size) { + if (::arrow::is_fixed_width(dtype->id()) && !nullable) { + // for nullable types we cannot calculate the exact number of elements because + // not all elements are fed through the chunker (null elements are skipped) + auto byte_width = (dtype->id() == ::arrow::Type::BOOL) ? 
1 : dtype->byte_width(); + auto min_length = ElementCount(min_chunk_size, byte_width, nullable); + auto max_length = ElementCount(max_chunk_size, byte_width, nullable); AssertAllBetween(base_result.lengths, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); AssertAllBetween(modified_result.lengths, min_length, max_length, @@ -693,6 +695,15 @@ struct CaseConfig { size_t bytes_per_record; }; +// Define PrintTo for MyStruct +void PrintTo(const CaseConfig& param, std::ostream* os) { + *os << "{ " << param.dtype->ToString(); + if (param.is_nullable) { + *os << " nullable"; + } + *os << " }"; +} + class TestColumnCDC : public ::testing::TestWithParam { protected: // Column random table parts for testing @@ -893,6 +904,9 @@ TEST_P(TestColumnCDC, EmptyTable) { INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, testing::Values( + // Boolean + CaseConfig{::arrow::boolean(), false, 1}, + // Numeric CaseConfig{::arrow::uint8(), false, 1}, CaseConfig{::arrow::uint16(), false, 2}, CaseConfig{::arrow::uint32(), false, 4}, CaseConfig{::arrow::uint64(), true, 8}, CaseConfig{::arrow::int8(), false, 1}, CaseConfig{::arrow::int16(), false, 2}, @@ -901,13 +915,16 @@ INSTANTIATE_TEST_SUITE_P( CaseConfig{::arrow::float32(), false, 4}, CaseConfig{::arrow::float64(), true, 8}, CaseConfig{::arrow::decimal128(18, 6), false, 16}, CaseConfig{::arrow::decimal256(40, 6), false, 32}, + // Binary-like CaseConfig{::arrow::utf8(), false, 16}, CaseConfig{::arrow::binary(), true, 16}, CaseConfig{::arrow::fixed_size_binary(16), true, 16}, + // Temporal CaseConfig{::arrow::date32(), false, 4}, CaseConfig{::arrow::time32(::arrow::TimeUnit::MILLI), true, 4}, CaseConfig{::arrow::time64(::arrow::TimeUnit::NANO), false, 8}, CaseConfig{::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8}, CaseConfig{::arrow::duration(::arrow::TimeUnit::NANO), false, 8}, + // Nested types CaseConfig{::arrow::list(::arrow::int32()), false, 16}, CaseConfig{::arrow::list(::arrow::int32()), true, 
18}, CaseConfig{::arrow::list(::arrow::utf8()), true, 18}, From 1dd53e9c5bb3fc5df702946a92ec2404751174c9 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 7 Mar 2025 19:49:45 +0100 Subject: [PATCH 043/102] describe test utilities in more details --- cpp/src/parquet/chunker_internal_test.cc | 42 ++++++++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index f998dc01675..d5c625a0935 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -346,79 +346,107 @@ Result WriteAndGetPageInfo(const std::shared_ptr
& table, // vectors of uint64_t values. using ChunkDiff = std::pair; +/** + * Finds the differences between two sequences of chunk lengths or sizes. + * Uses a longest common subsequence algorithm to identify matching elements + * and extract the differences between the sequences. + * + * @param first The first sequence of chunk values + * @param second The second sequence of chunk values + * @return A vector of differences, where each difference is a pair of + * subsequences (one from each input) that differ + */ std::vector FindDifferences(const ChunkList& first, const ChunkList& second) { - // Compute longest-common-subsequence between the two vectors. + // Compute the longest common subsequence using dynamic programming size_t n = first.size(), m = second.size(); std::vector> dp(n + 1, std::vector(m + 1, 0)); + + // Fill the dynamic programming table for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < m; j++) { if (first[i] == second[j]) { + // If current elements match, extend the LCS dp[i + 1][j + 1] = dp[i][j] + 1; } else { + // If current elements don't match, take the best option dp[i + 1][j + 1] = std::max(dp[i + 1][j], dp[i][j + 1]); } } } - // Backtrack to get common indices. + // Backtrack through the dynamic programming table to reconstruct the common + // parts and their positions in the original sequences std::vector> common; for (size_t i = n, j = m; i > 0 && j > 0;) { if (first[i - 1] == second[j - 1]) { + // Found a common element, add to common list common.emplace_back(i - 1, j - 1); i--, j--; } else if (dp[i - 1][j] >= dp[i][j - 1]) { + // Move in the direction of the larger LCS value i--; } else { j--; } } + // Reverse to get indices in ascending order std::reverse(common.begin(), common.end()); - // Build raw differences. 
+ // Build the differences by finding sequences between common elements std::vector result; size_t last_i = 0, last_j = 0; for (auto& c : common) { auto ci = c.first; auto cj = c.second; + // If there's a gap between the last common element and this one, + // record the difference if (ci > last_i || cj > last_j) { result.push_back({{first.begin() + last_i, first.begin() + ci}, {second.begin() + last_j, second.begin() + cj}}); } + // Move past this common element last_i = ci + 1; last_j = cj + 1; } + + // Handle any remaining elements after the last common element if (last_i < n || last_j < m) { result.push_back( {{first.begin() + last_i, first.end()}, {second.begin() + last_j, second.end()}}); } - // Merge adjacent diffs if one side is empty in the first diff and the other side - // is empty in the next diff, to avoid splitting single changes into two parts. + // Post-process: merge adjacent diffs to avoid splitting single changes into multiple + // parts std::vector merged; for (auto& diff : result) { if (!merged.empty()) { auto& prev = merged.back(); + // Check if we can merge with the previous diff bool can_merge_a = prev.first.empty() && !prev.second.empty() && !diff.first.empty() && diff.second.empty(); bool can_merge_b = prev.second.empty() && !prev.first.empty() && !diff.second.empty() && diff.first.empty(); + if (can_merge_a) { - // Combine into one change + // Combine into one diff: keep prev's second, use diff's first prev.first = std::move(diff.first); continue; } else if (can_merge_b) { + // Combine into one diff: keep prev's first, use diff's second prev.second = std::move(diff.second); continue; } } + // If we can't merge, add this diff to the result merged.push_back(std::move(diff)); } - return merged; } void PrintDifferences(const ChunkList& original, const ChunkList& modified, std::vector& diffs) { + // Utility function to print the original and modified sequences, and the diffs + // between them. 
Used in case of failing assertions to display the differences. std::cout << "Original: "; for (const auto& val : original) { std::cout << val << " "; From e39c2439f63ffcc20a630954bf923cc06b14fb22 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 10 Mar 2025 22:28:41 +0100 Subject: [PATCH 044/102] fix use .getValue() for binary arrays --- cpp/src/parquet/chunker_internal.cc | 76 ++++++++++++++++------------- cpp/src/parquet/chunker_internal.h | 9 ++-- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index d13186ec91d..09375154e3d 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -69,30 +69,33 @@ void ContentDefinedChunker::Roll(const bool value) { has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } -template -void ContentDefinedChunker::Roll(const T* value) { - constexpr size_t BYTE_WIDTH = sizeof(T); - chunk_size_ += BYTE_WIDTH; +template +void ContentDefinedChunker::Roll(const uint8_t* value) { + chunk_size_ += ByteWidth; if (chunk_size_ < min_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } - auto bytes = reinterpret_cast(value); - for (size_t i = 0; i < BYTE_WIDTH; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][bytes[i]]; + for (size_t i = 0; i < ByteWidth; ++i) { + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } } -void ContentDefinedChunker::Roll(const uint8_t* value, int64_t num_bytes) { - chunk_size_ += num_bytes; +template +void ContentDefinedChunker::Roll(const T* value) { + return Roll(reinterpret_cast(value)); +} + +void ContentDefinedChunker::Roll(const uint8_t* value, int64_t length) { + chunk_size_ += length; if (chunk_size_ < min_size_) { // short-circuit if 
we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } - for (int64_t i = 0; i < num_bytes; ++i) { + for (auto i = 0; i < length; ++i) { rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } @@ -202,21 +205,22 @@ const std::vector ContentDefinedChunker::Calculate(const int16_t* def_lev return chunks; } -#define FIXED_WIDTH_CASE(CType) \ - { \ - const auto raw_values = values.data()->GetValues(1); \ - return Calculate(def_levels, rep_levels, num_levels, \ - [&](int64_t i) { return Roll(raw_values + i); }); \ +#define FIXED_WIDTH_CASE(ByteWidth) \ + { \ + const auto raw_values = values.data()->GetValues(1); \ + return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \ + return Roll(raw_values + i * ByteWidth); \ + }); \ } -#define BINARY_LIKE_CASE(OffsetCType) \ +#define BINARY_LIKE_CASE(ArrayType) \ { \ - const auto raw_offsets = values.data()->GetValues(1); \ - const auto raw_values = values.data()->GetValues(2); \ + const auto& array = static_cast(values); \ + const uint8_t* value; \ + ArrayType::offset_type length; \ return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \ - const OffsetCType pos = raw_offsets[i]; \ - const OffsetCType length = raw_offsets[i + 1] - pos; \ - Roll(raw_values + pos, length); \ + value = array.GetValue(i, &length); \ + Roll(value, length); \ }); \ } @@ -235,17 +239,17 @@ const std::vector ContentDefinedChunker::GetBoundaries( } case ::arrow::Type::INT8: case ::arrow::Type::UINT8: - FIXED_WIDTH_CASE(uint8_t) + FIXED_WIDTH_CASE(1) case ::arrow::Type::INT16: case ::arrow::Type::UINT16: case ::arrow::Type::HALF_FLOAT: - FIXED_WIDTH_CASE(uint16_t) + FIXED_WIDTH_CASE(2) case ::arrow::Type::INT32: case ::arrow::Type::UINT32: case ::arrow::Type::FLOAT: case ::arrow::Type::DATE32: case ::arrow::Type::TIME32: - FIXED_WIDTH_CASE(uint32_t) + 
FIXED_WIDTH_CASE(4) case ::arrow::Type::INT64: case ::arrow::Type::UINT64: case ::arrow::Type::DOUBLE: @@ -253,22 +257,24 @@ const std::vector ContentDefinedChunker::GetBoundaries( case ::arrow::Type::TIME64: case ::arrow::Type::TIMESTAMP: case ::arrow::Type::DURATION: - FIXED_WIDTH_CASE(uint64_t) + FIXED_WIDTH_CASE(8) + case ::arrow::Type::DECIMAL128: + FIXED_WIDTH_CASE(16) + case ::arrow::Type::DECIMAL256: + FIXED_WIDTH_CASE(32) case ::arrow::Type::BINARY: + BINARY_LIKE_CASE(::arrow::BinaryArray) case ::arrow::Type::STRING: - BINARY_LIKE_CASE(int32_t) + BINARY_LIKE_CASE(::arrow::StringArray) case ::arrow::Type::LARGE_BINARY: + BINARY_LIKE_CASE(::arrow::LargeBinaryArray) case ::arrow::Type::LARGE_STRING: - BINARY_LIKE_CASE(int64_t) - case ::arrow::Type::DECIMAL128: - case ::arrow::Type::DECIMAL256: + BINARY_LIKE_CASE(::arrow::LargeStringArray) case ::arrow::Type::FIXED_SIZE_BINARY: { - const auto raw_values = values.data()->GetValues(1); - const auto byte_width = - static_cast(values).byte_width(); - return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { - return Roll(raw_values + i * byte_width, byte_width); - }); + const auto& array = static_cast(values); + const auto byte_width = array.byte_width(); + return Calculate(def_levels, rep_levels, num_levels, + [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); } case ::arrow::Type::DICTIONARY: return GetBoundaries( diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 5299e1c8ec8..235543fc706 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -132,16 +132,19 @@ class ContentDefinedChunker { const ::arrow::Array& values); private: - void Roll(const bool value); + inline void Roll(const bool value); // Update the rolling hash with a compile-time known sized value, set has_matched_ to // true if the hash matches the mask. 
+ template + void inline Roll(const uint8_t* value); + template - void Roll(const T* value); + inline void Roll(const T* value); // Update the rolling hash with a binary-like value, set has_matched_ to true if the // hash matches the mask. - void Roll(const uint8_t* value, int64_t num_bytes); + inline void Roll(const uint8_t* value, int64_t length); // Evaluate whether a new chunk should be created based on the has_matched_, nth_run_ // and chunk_size_ state. From 5a9dd372a22ad5cae65af85755e8f7ed7389398f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 10 Mar 2025 23:38:13 +0100 Subject: [PATCH 045/102] add more details about calculating the mask --- cpp/src/parquet/chunker_internal.cc | 41 +++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 09375154e3d..bb4da9a0d3a 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -28,15 +28,46 @@ namespace parquet::internal { +/// Calculate the mask to use for the rolling hash, the mask is used to determine if a +/// new chunk should be created based on the rolling hash value. The mask is calculated +/// based on the min_size, max_size and norm_factor parameters. +/// +/// Assuming that the gear hash hash random values with a uniform distribution, then each +/// bit in the actual value of rolling_hash_ has even probability of being set so a mask +/// with the top N bits set has a probability of 1/2^N of matching the rolling hash. This +/// is the judgment criteria for the original gear hash based content-defined chunking. +/// The main drawback of this approach is the non-uniform distribution of the chunk sizes. 
+/// +/// Later on the FastCDC has improved the process by introducing: +/// - sub-minimum chunk cut-point skipping (not hashing the first `min_size` bytes) +/// - chunk size normalization (using two masks) +/// +/// This implementation uses cut-point skipping because it improves the overall +/// performance and a more accurate alternative to have less skewed chunk size +/// distribution. Instead of using two different masks (one with a lower and one with a +/// probability of matching and switching them based on the actual chunk size), we rather +/// use 8 different gear hash tables and require having 8 consecutive matches while +/// switching between the used hashtables. This approach is based on central limit theorem +/// and approximates normal distribution of the chunk sizes. +// +// @param min_size The minimum chunk size (default 256KiB) +// @param max_size The maximum chunk size (default 1MiB) +// @param norm_factor Normalization factor (default 0) +// @return The mask used to compare against the rolling hash static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) { - // we aim for gaussian-like distribution of chunk sizes between min_size and max_size + // calculate the average size of the chunks int64_t avg_size = (min_size + max_size) / 2; - // we skip calculating gearhash for the first `min_size` bytes, so we are looking for - // a smaller chunk as the average size + // since we are skipping the first `min_size` bytes for each chunk, we need to + // target a smaller chunk size to reach the average size after skipping the first + // `min_size` bytes int64_t target_size = avg_size - min_size; + // assuming that the gear hash has a uniform distribution, we can calculate the mask + // by taking the log2 of the target size size_t mask_bits = static_cast(std::floor(std::log2(target_size))); - // -3 because we are using 8 hash tables to have more gaussian-like distribution - // `norm_factor` narrows the chunk size distribution aroun 
avg_size + // -3 because we are using 8 hash tables to have more gaussian-like distribution, + // a user defined `norm_factor` can be used to adjust the mask size, hence the matching + // probability, by increasing the norm_factor we increase the probability of matching + // the mask, forcing the distribution closer to the average size size_t effective_bits = mask_bits - 3 - norm_factor; return std::numeric_limits::max() << (64 - effective_bits); } From 19089185bc610f905b4b809c6b6cdc807adcfff3 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 13 Mar 2025 09:30:43 +0100 Subject: [PATCH 046/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 17 +++++++++-------- cpp/src/parquet/chunker_internal.h | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index bb4da9a0d3a..b5978063306 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -45,8 +45,8 @@ namespace parquet::internal { /// This implementation uses cut-point skipping because it improves the overall /// performance and a more accurate alternative to have less skewed chunk size /// distribution. Instead of using two different masks (one with a lower and one with a -/// probability of matching and switching them based on the actual chunk size), we rather -/// use 8 different gear hash tables and require having 8 consecutive matches while +/// higher probability of matching and switching them based on the actual chunk size), we +/// rather use 8 different gear hash tables and require having 8 consecutive matches while /// switching between the used hashtables. This approach is based on central limit theorem /// and approximates normal distribution of the chunk sizes. 
// @@ -139,8 +139,9 @@ bool ContentDefinedChunker::NeedNewChunk() { has_matched_ = false; // in order to have a normal distribution of chunk sizes, we only create a new chunk // if the adjused mask matches the rolling hash 8 times in a row, each run uses a - // different gearhash table (gearhash's chunk size has exponential distribution, and - // we use central limit theorem to approximate normal distribution) + // different gearhash table (gearhash's chunk size has geometric distribution, and + // we use central limit theorem to approximate normal distribution, see section 6.2.1 + // in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf) if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) { nth_run_ = 0; chunk_size_ = 0; @@ -158,10 +159,10 @@ bool ContentDefinedChunker::NeedNewChunk() { } template -const std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const RollFunc& RollValue) { +std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const RollFunc& RollValue) { std::vector chunks; int64_t offset; int64_t prev_offset = 0; diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 235543fc706..53830d41a5b 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -152,8 +152,8 @@ class ContentDefinedChunker { // Calculate the chunk boundaries for typed Arrow arrays. 
template - const std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const RollFunc& RollValue); + std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const RollFunc& RollValue); // Reference to the column's level information const internal::LevelInfo& level_info_; From 40b175c34dbe2af48d23f02b8b7e539cdfa85cf5 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 13 Mar 2025 09:32:10 +0100 Subject: [PATCH 047/102] Separate include groups with a new line --- cpp/src/parquet/chunker_internal.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index b5978063306..2aaaf4dc0e2 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -20,6 +20,7 @@ #include #include #include + #include "arrow/array.h" #include "arrow/util/logging.h" #include "parquet/chunker_internal_generated.h" From a9635b06659c6c8af5ab3ca4e8b9d607761ce2b4 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 14 Mar 2025 16:12:03 +0100 Subject: [PATCH 048/102] Remove Chunk constructor and hide implementation using PIMPL --- cpp/src/parquet/chunker_internal.cc | 433 +++++++++++++++------------- cpp/src/parquet/chunker_internal.h | 59 +--- cpp/src/parquet/column_writer.cc | 4 +- 3 files changed, 246 insertions(+), 250 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 2aaaf4dc0e2..f3c64a9d422 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -73,170 +73,176 @@ static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) return std::numeric_limits::max() << (64 - effective_bits); } -ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, - int64_t min_size, int64_t max_size, - int8_t norm_factor) - : level_info_(level_info), - min_size_(min_size), - 
max_size_(max_size), - hash_mask_(GetMask(min_size, max_size, norm_factor)) { - if (min_size_ < 0) { - throw ParquetException("min_size must be non-negative"); - } - if (max_size_ < 0) { - throw ParquetException("max_size must be non-negative"); - } - if (min_size_ > max_size_) { - throw ParquetException("min_size must be less than or equal to max_size"); +class ContentDefinedChunker::Impl { + public: + Impl(const LevelInfo& level_info, int64_t min_size, int64_t max_size, + int8_t norm_factor) + : level_info_(level_info), + min_size_(min_size), + max_size_(max_size), + hash_mask_(GetMask(min_size, max_size, norm_factor)) { + if (min_size_ < 0) { + throw ParquetException("min_size must be non-negative"); + } + if (max_size_ < 0) { + throw ParquetException("max_size must be non-negative"); + } + if (min_size_ > max_size_) { + throw ParquetException("min_size must be less than or equal to max_size"); + } } -} -void ContentDefinedChunker::Roll(const bool value) { - if (chunk_size_++ < min_size_) { - // short-circuit if we haven't reached the minimum chunk size, this speeds up the - // chunking process since the gearhash doesn't need to be updated - return; + void Roll(const bool value) { + if (chunk_size_++ < min_size_) { + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; + } + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value]; + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); } - rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); -} -template -void ContentDefinedChunker::Roll(const uint8_t* value) { - chunk_size_ += ByteWidth; - if (chunk_size_ < min_size_) { - // short-circuit if we haven't reached the minimum chunk size, this speeds up the - // chunking process since the gearhash doesn't need to be updated - return; + template 
+ void Roll(const uint8_t* value) { + // Update the rolling hash with a compile-time known sized value, set has_matched_ to + // true if the hash matches the mask. + + chunk_size_ += ByteWidth; + if (chunk_size_ < min_size_) { + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; + } + for (size_t i = 0; i < ByteWidth; ++i) { + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + } } - for (size_t i = 0; i < ByteWidth; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + + template + void Roll(const T* value) { + return Roll(reinterpret_cast(value)); } -} -template -void ContentDefinedChunker::Roll(const T* value) { - return Roll(reinterpret_cast(value)); -} + void Roll(const uint8_t* value, int64_t length) { + // Update the rolling hash with a binary-like value, set has_matched_ to true if the + // hash matches the mask. 
-void ContentDefinedChunker::Roll(const uint8_t* value, int64_t length) { - chunk_size_ += length; - if (chunk_size_ < min_size_) { - // short-circuit if we haven't reached the minimum chunk size, this speeds up the - // chunking process since the gearhash doesn't need to be updated - return; - } - for (auto i = 0; i < length; ++i) { - rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + chunk_size_ += length; + if (chunk_size_ < min_size_) { + // short-circuit if we haven't reached the minimum chunk size, this speeds up the + // chunking process since the gearhash doesn't need to be updated + return; + } + for (auto i = 0; i < length; ++i) { + rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; + has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + } } -} -bool ContentDefinedChunker::NeedNewChunk() { - // decide whether to create a new chunk based on the rolling hash; has_matched_ is - // set to true if we encountered a match since the last NeedNewChunk() call - if (ARROW_PREDICT_FALSE(has_matched_)) { - has_matched_ = false; - // in order to have a normal distribution of chunk sizes, we only create a new chunk - // if the adjused mask matches the rolling hash 8 times in a row, each run uses a - // different gearhash table (gearhash's chunk size has geometric distribution, and - // we use central limit theorem to approximate normal distribution, see section 6.2.1 - // in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf) - if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) { - nth_run_ = 0; + bool NeedNewChunk() { + // decide whether to create a new chunk based on the rolling hash; has_matched_ is + // set to true if we encountered a match since the last NeedNewChunk() call + if (ARROW_PREDICT_FALSE(has_matched_)) { + has_matched_ = false; + // in order to have a normal distribution of chunk sizes, we only create a new chunk + // if 
the adjused mask matches the rolling hash 8 times in a row, each run uses a + // different gearhash table (gearhash's chunk size has geometric distribution, and + // we use central limit theorem to approximate normal distribution, see + // section 6.2.1 in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf) + if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) { + nth_run_ = 0; + chunk_size_ = 0; + return true; + } + } + if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { + // we have a hard limit on the maximum chunk size, note that we don't reset the + // rolling hash state here, so the next NeedNewChunk() call will continue from the + // current state chunk_size_ = 0; return true; } + return false; } - if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { - // we have a hard limit on the maximum chunk size, note that we don't reset the - // rolling hash state here, so the next NeedNewChunk() call will continue from the - // current state - chunk_size_ = 0; - return true; - } - return false; -} -template -std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const RollFunc& RollValue) { - std::vector chunks; - int64_t offset; - int64_t prev_offset = 0; - int64_t prev_value_offset = 0; - bool has_def_levels = level_info_.def_level > 0; - bool has_rep_levels = level_info_.rep_level > 0; + template + std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const RollFunc& RollValue) { + // Calculate the chunk boundaries for typed Arrow arrays. 
+ std::vector chunks; + int64_t offset; + int64_t prev_offset = 0; + int64_t prev_value_offset = 0; + bool has_def_levels = level_info_.def_level > 0; + bool has_rep_levels = level_info_.rep_level > 0; - if (!has_rep_levels && !has_def_levels) { - // fastest path for non-nested non-null data - for (offset = 0; offset < num_levels; ++offset) { - RollValue(offset); - if (NeedNewChunk()) { - chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); - prev_offset = offset; - } - } - // set the previous value offset to add the last chunk - prev_value_offset = prev_offset; - } else if (!has_rep_levels) { - // non-nested data with nulls - int16_t def_level; - for (int64_t offset = 0; offset < num_levels; ++offset) { - def_level = def_levels[offset]; - - Roll(&def_level); - if (def_level == level_info_.def_level) { + if (!has_rep_levels && !has_def_levels) { + // fastest path for non-nested non-null data + for (offset = 0; offset < num_levels; ++offset) { RollValue(offset); + if (NeedNewChunk()) { + chunks.push_back({prev_offset, prev_offset, offset - prev_offset}); + prev_offset = offset; + } } - if (NeedNewChunk()) { - chunks.emplace_back(prev_offset, prev_offset, offset - prev_offset); - prev_offset = offset; + // set the previous value offset to add the last chunk + prev_value_offset = prev_offset; + } else if (!has_rep_levels) { + // non-nested data with nulls + int16_t def_level; + for (int64_t offset = 0; offset < num_levels; ++offset) { + def_level = def_levels[offset]; + + Roll(&def_level); + if (def_level == level_info_.def_level) { + RollValue(offset); + } + if (NeedNewChunk()) { + chunks.push_back({prev_offset, prev_offset, offset - prev_offset}); + prev_offset = offset; + } } - } - // set the previous value offset to add the last chunk - prev_value_offset = prev_offset; - } else { - // nested data with nulls - int16_t def_level; - int16_t rep_level; - int64_t value_offset = 0; + // set the previous value offset to add the last chunk + 
prev_value_offset = prev_offset; + } else { + // nested data with nulls + int16_t def_level; + int16_t rep_level; + int64_t value_offset = 0; - for (offset = 0; offset < num_levels; ++offset) { - def_level = def_levels[offset]; - rep_level = rep_levels[offset]; + for (offset = 0; offset < num_levels; ++offset) { + def_level = def_levels[offset]; + rep_level = rep_levels[offset]; - Roll(&def_level); - Roll(&rep_level); - if (def_level == level_info_.def_level) { - RollValue(value_offset); - } + Roll(&def_level); + Roll(&rep_level); + if (def_level == level_info_.def_level) { + RollValue(value_offset); + } - if ((rep_level == 0) && NeedNewChunk()) { - // if we are at a record boundary and need a new chunk, we create a new chunk - auto levels_to_write = offset - prev_offset; - if (levels_to_write > 0) { - chunks.emplace_back(prev_offset, prev_value_offset, levels_to_write); - prev_offset = offset; - prev_value_offset = value_offset; + if ((rep_level == 0) && NeedNewChunk()) { + // if we are at a record boundary and need a new chunk, we create a new chunk + auto levels_to_write = offset - prev_offset; + if (levels_to_write > 0) { + chunks.push_back({prev_offset, prev_value_offset, levels_to_write}); + prev_offset = offset; + prev_value_offset = value_offset; + } + } + if (def_level >= level_info_.repeated_ancestor_def_level) { + // we only increment the value offset if we have a leaf value + ++value_offset; } - } - if (def_level >= level_info_.repeated_ancestor_def_level) { - // we only increment the value offset if we have a leaf value - ++value_offset; } } - } - // add the last chunk if we have any levels left - if (prev_offset < num_levels) { - chunks.emplace_back(prev_offset, prev_value_offset, num_levels - prev_offset); + // add the last chunk if we have any levels left + if (prev_offset < num_levels) { + chunks.push_back({prev_offset, prev_value_offset, num_levels - prev_offset}); + } + return chunks; } - return chunks; -} #define FIXED_WIDTH_CASE(ByteWidth) \ { 
\ @@ -257,65 +263,104 @@ std::vector ContentDefinedChunker::Calculate(const int16_t* def_levels, }); \ } -const std::vector ContentDefinedChunker::GetBoundaries( - const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& values) { - auto type_id = values.type()->id(); - switch (type_id) { - case ::arrow::Type::NA: { - return Calculate(def_levels, rep_levels, num_levels, [](int64_t) {}); - } - case ::arrow::Type::BOOL: { - const auto& bool_array = static_cast(values); - return Calculate(def_levels, rep_levels, num_levels, - [&](int64_t i) { return Roll(bool_array.Value(i)); }); - } - case ::arrow::Type::INT8: - case ::arrow::Type::UINT8: - FIXED_WIDTH_CASE(1) - case ::arrow::Type::INT16: - case ::arrow::Type::UINT16: - case ::arrow::Type::HALF_FLOAT: - FIXED_WIDTH_CASE(2) - case ::arrow::Type::INT32: - case ::arrow::Type::UINT32: - case ::arrow::Type::FLOAT: - case ::arrow::Type::DATE32: - case ::arrow::Type::TIME32: - FIXED_WIDTH_CASE(4) - case ::arrow::Type::INT64: - case ::arrow::Type::UINT64: - case ::arrow::Type::DOUBLE: - case ::arrow::Type::DATE64: - case ::arrow::Type::TIME64: - case ::arrow::Type::TIMESTAMP: - case ::arrow::Type::DURATION: - FIXED_WIDTH_CASE(8) - case ::arrow::Type::DECIMAL128: - FIXED_WIDTH_CASE(16) - case ::arrow::Type::DECIMAL256: - FIXED_WIDTH_CASE(32) - case ::arrow::Type::BINARY: - BINARY_LIKE_CASE(::arrow::BinaryArray) - case ::arrow::Type::STRING: - BINARY_LIKE_CASE(::arrow::StringArray) - case ::arrow::Type::LARGE_BINARY: - BINARY_LIKE_CASE(::arrow::LargeBinaryArray) - case ::arrow::Type::LARGE_STRING: - BINARY_LIKE_CASE(::arrow::LargeStringArray) - case ::arrow::Type::FIXED_SIZE_BINARY: { - const auto& array = static_cast(values); - const auto byte_width = array.byte_width(); - return Calculate(def_levels, rep_levels, num_levels, - [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); + std::vector GetChunks(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, 
const ::arrow::Array& values) { + auto type_id = values.type()->id(); + switch (type_id) { + case ::arrow::Type::NA: { + return Calculate(def_levels, rep_levels, num_levels, [](int64_t) {}); + } + case ::arrow::Type::BOOL: { + const auto& bool_array = static_cast(values); + return Calculate(def_levels, rep_levels, num_levels, + [&](int64_t i) { return Roll(bool_array.Value(i)); }); + } + case ::arrow::Type::INT8: + case ::arrow::Type::UINT8: + FIXED_WIDTH_CASE(1) + case ::arrow::Type::INT16: + case ::arrow::Type::UINT16: + case ::arrow::Type::HALF_FLOAT: + FIXED_WIDTH_CASE(2) + case ::arrow::Type::INT32: + case ::arrow::Type::UINT32: + case ::arrow::Type::FLOAT: + case ::arrow::Type::DATE32: + case ::arrow::Type::TIME32: + FIXED_WIDTH_CASE(4) + case ::arrow::Type::INT64: + case ::arrow::Type::UINT64: + case ::arrow::Type::DOUBLE: + case ::arrow::Type::DATE64: + case ::arrow::Type::TIME64: + case ::arrow::Type::TIMESTAMP: + case ::arrow::Type::DURATION: + FIXED_WIDTH_CASE(8) + case ::arrow::Type::DECIMAL128: + FIXED_WIDTH_CASE(16) + case ::arrow::Type::DECIMAL256: + FIXED_WIDTH_CASE(32) + case ::arrow::Type::BINARY: + BINARY_LIKE_CASE(::arrow::BinaryArray) + case ::arrow::Type::STRING: + BINARY_LIKE_CASE(::arrow::StringArray) + case ::arrow::Type::LARGE_BINARY: + BINARY_LIKE_CASE(::arrow::LargeBinaryArray) + case ::arrow::Type::LARGE_STRING: + BINARY_LIKE_CASE(::arrow::LargeStringArray) + case ::arrow::Type::FIXED_SIZE_BINARY: { + const auto& array = static_cast(values); + const auto byte_width = array.byte_width(); + return Calculate(def_levels, rep_levels, num_levels, + [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); + } + case ::arrow::Type::DICTIONARY: + return GetChunks(def_levels, rep_levels, num_levels, + *static_cast(values).indices()); + default: + throw ParquetException("Unsupported Arrow array type " + + values.type()->ToString()); } - case ::arrow::Type::DICTIONARY: - return GetBoundaries( - def_levels, rep_levels, num_levels, - 
*static_cast(values).indices()); - default: - throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); } + + private: + // Reference to the column's level information + const internal::LevelInfo& level_info_; + // Minimum chunk size in bytes, the rolling hash will not be updated until this size is + // reached for each chunk. Note that all data sent through the hash function is counted + // towards the chunk size, including definition and repetition levels. + const int64_t min_size_; + const int64_t max_size_; + // The mask to match the rolling hash against to determine if a new chunk should be + // created. The mask is calculated based on min/max chunk size and the normalization + // factor. + const uint64_t hash_mask_; + + // Whether the rolling hash has matched the mask since the last chunk creation. This + // flag is set true by the Roll() function when the mask is matched and reset to false + // by NeedNewChunk() method. + bool has_matched_ = false; + // The current run of the rolling hash, used to normalize the chunk size distribution + // by requiring multiple consecutive matches to create a new chunk. + int8_t nth_run_ = 0; + // Current chunk size in bytes, reset to 0 when a new chunk is created. + int64_t chunk_size_ = 0; + // Rolling hash state, never reset only initialized once for the entire column. 
+ uint64_t rolling_hash_ = 0; +}; + +ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, + int64_t min_size, int64_t max_size, + int8_t norm_factor) + : impl_(new Impl(level_info, min_size, max_size, norm_factor)) {} + +ContentDefinedChunker::~ContentDefinedChunker() = default; + +std::vector ContentDefinedChunker::GetChunks(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const ::arrow::Array& values) { + return impl_->GetChunks(def_levels, rep_levels, num_levels, values); } } // namespace parquet::internal diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 53830d41a5b..50530051754 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -31,11 +31,6 @@ struct Chunk { int64_t level_offset; int64_t value_offset; int64_t levels_to_write; - - Chunk(int64_t level_offset, int64_t value_offset, int64_t levels_to_write) - : level_offset(level_offset), - value_offset(value_offset), - levels_to_write(levels_to_write) {} }; /// CDC (Content-Defined Chunking) is a technique that divides data into variable-sized @@ -119,6 +114,7 @@ class ContentDefinedChunker { /// ratio. 
ContentDefinedChunker(const LevelInfo& level_info, int64_t min_size, int64_t max_size, int8_t norm_factor = 0); + ~ContentDefinedChunker(); /// Get the chunk boundaries for the given column data /// @@ -127,57 +123,12 @@ class ContentDefinedChunker { /// @param num_levels Number of levels /// @param values Column values as an Arrow array /// @return Vector of Chunk objects representing the chunk boundaries - const std::vector GetBoundaries(const int16_t* def_levels, - const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& values); + std::vector GetChunks(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const ::arrow::Array& values); private: - inline void Roll(const bool value); - - // Update the rolling hash with a compile-time known sized value, set has_matched_ to - // true if the hash matches the mask. - template - void inline Roll(const uint8_t* value); - - template - inline void Roll(const T* value); - - // Update the rolling hash with a binary-like value, set has_matched_ to true if the - // hash matches the mask. - inline void Roll(const uint8_t* value, int64_t length); - - // Evaluate whether a new chunk should be created based on the has_matched_, nth_run_ - // and chunk_size_ state. - inline bool NeedNewChunk(); - - // Calculate the chunk boundaries for typed Arrow arrays. - template - std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const RollFunc& RollValue); - - // Reference to the column's level information - const internal::LevelInfo& level_info_; - // Minimum chunk size in bytes, the rolling hash will not be updated until this size is - // reached for each chunk. Note that all data sent through the hash function is counted - // towards the chunk size, including definition and repetition levels. - const int64_t min_size_; - const int64_t max_size_; - // The mask to match the rolling hash against to determine if a new chunk should be - // created. 
The mask is calculated based on min/max chunk size and the normalization - // factor. - const uint64_t hash_mask_; - - // Whether the rolling hash has matched the mask since the last chunk creation. This - // flag is set true by the Roll() function when the mask is matched and reset to false - // by NeedNewChunk() method. - bool has_matched_ = false; - // The current run of the rolling hash, used to normalize the chunk size distribution - // by requiring multiple consecutive matches to create a new chunk. - int8_t nth_run_ = 0; - // Current chunk size in bytes, reset to 0 when a new chunk is created. - int64_t chunk_size_ = 0; - // Rolling hash state, never reset only initialized once for the entire column. - uint64_t rolling_hash_ = 0; + class Impl; + std::unique_ptr impl_; }; } // namespace parquet::internal diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 2661e260d0c..d59a8a43d8b 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1364,8 +1364,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } if (properties_->cdc_enabled()) { - auto boundaries = content_defined_chunker_.GetBoundaries(def_levels, rep_levels, - num_levels, leaf_array); + auto boundaries = content_defined_chunker_.GetChunks(def_levels, rep_levels, + num_levels, leaf_array); for (auto chunk : boundaries) { auto chunk_array = leaf_array.Slice(chunk.value_offset); auto chunk_def_levels = AddIfNotNull(def_levels, chunk.level_offset); From 983ade9c2a24c24c442d3a835436393cced04e2c Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Mar 2025 12:45:03 +0100 Subject: [PATCH 049/102] Prefer templated methods over macros --- cpp/src/parquet/chunker_internal.cc | 57 ++++++++++++++++------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index f3c64a9d422..adb5a40e42a 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ 
b/cpp/src/parquet/chunker_internal.cc @@ -29,6 +29,8 @@ namespace parquet::internal { +using ::arrow::internal::checked_cast; + /// Calculate the mask to use for the rolling hash, the mask is used to determine if a /// new chunk should be created based on the rolling hash value. The mask is calculated /// based on the min_size, max_size and norm_factor parameters. @@ -244,23 +246,28 @@ class ContentDefinedChunker::Impl { return chunks; } -#define FIXED_WIDTH_CASE(ByteWidth) \ - { \ - const auto raw_values = values.data()->GetValues(1); \ - return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \ - return Roll(raw_values + i * ByteWidth); \ - }); \ + template + std::vector CalculateFixedWidth(const int16_t* def_levels, + const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& values) { + const uint8_t* raw_values = + values.data()->GetValues(1, 0) + values.offset() * kByteWidth; + return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { + return Roll(&raw_values[i * kByteWidth]); + }); } -#define BINARY_LIKE_CASE(ArrayType) \ - { \ - const auto& array = static_cast(values); \ - const uint8_t* value; \ - ArrayType::offset_type length; \ - return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { \ - value = array.GetValue(i, &length); \ - Roll(value, length); \ - }); \ + template + std::vector CalculateBinaryLike(const int16_t* def_levels, + const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& values) { + const auto& array = checked_cast(values); + const uint8_t* value; + typename ArrayType::offset_type length; + return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { + value = array.GetValue(i, &length); + Roll(value, length); + }); } std::vector GetChunks(const int16_t* def_levels, const int16_t* rep_levels, @@ -277,17 +284,17 @@ class ContentDefinedChunker::Impl { } case ::arrow::Type::INT8: case ::arrow::Type::UINT8: - FIXED_WIDTH_CASE(1) + return 
CalculateFixedWidth<1>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::INT16: case ::arrow::Type::UINT16: case ::arrow::Type::HALF_FLOAT: - FIXED_WIDTH_CASE(2) + return CalculateFixedWidth<2>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::INT32: case ::arrow::Type::UINT32: case ::arrow::Type::FLOAT: case ::arrow::Type::DATE32: case ::arrow::Type::TIME32: - FIXED_WIDTH_CASE(4) + return CalculateFixedWidth<4>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::INT64: case ::arrow::Type::UINT64: case ::arrow::Type::DOUBLE: @@ -295,19 +302,19 @@ class ContentDefinedChunker::Impl { case ::arrow::Type::TIME64: case ::arrow::Type::TIMESTAMP: case ::arrow::Type::DURATION: - FIXED_WIDTH_CASE(8) + return CalculateFixedWidth<8>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::DECIMAL128: - FIXED_WIDTH_CASE(16) + return CalculateFixedWidth<16>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::DECIMAL256: - FIXED_WIDTH_CASE(32) + return CalculateFixedWidth<32>(def_levels, rep_levels, num_levels, values); case ::arrow::Type::BINARY: - BINARY_LIKE_CASE(::arrow::BinaryArray) case ::arrow::Type::STRING: - BINARY_LIKE_CASE(::arrow::StringArray) + return CalculateBinaryLike<::arrow::BinaryArray>(def_levels, rep_levels, + num_levels, values); case ::arrow::Type::LARGE_BINARY: - BINARY_LIKE_CASE(::arrow::LargeBinaryArray) case ::arrow::Type::LARGE_STRING: - BINARY_LIKE_CASE(::arrow::LargeStringArray) + return CalculateBinaryLike<::arrow::LargeBinaryArray>(def_levels, rep_levels, + num_levels, values); case ::arrow::Type::FIXED_SIZE_BINARY: { const auto& array = static_cast(values); const auto byte_width = array.byte_width(); From cc88a79eabc82b8aedd52b1dcabd070d0178a2ee Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Mar 2025 14:46:43 +0100 Subject: [PATCH 050/102] Use VisitType instead of manual switch based dispatching --- cpp/src/parquet/chunker_internal.cc | 77 ++++++++++++----------------- 
1 file changed, 31 insertions(+), 46 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index adb5a40e42a..99329f7a53d 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -23,6 +23,8 @@ #include "arrow/array.h" #include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" +#include "arrow/visit_type_inline.h" #include "parquet/chunker_internal_generated.h" #include "parquet/exception.h" #include "parquet/level_conversion.h" @@ -272,62 +274,45 @@ class ContentDefinedChunker::Impl { std::vector GetChunks(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { - auto type_id = values.type()->id(); - switch (type_id) { - case ::arrow::Type::NA: { + auto handle_type = [&](auto&& type) -> std::vector { + using ArrowType = std::decay_t; + if constexpr (std::is_same<::arrow::DataType, ArrowType>::value) { + // TODO(kszucs): this branch should be removed once #45816 is resolved + ::arrow::Unreachable("DataType is not a concrete type"); + } else if constexpr (ArrowType::type_id == ::arrow::Type::NA) { return Calculate(def_levels, rep_levels, num_levels, [](int64_t) {}); - } - case ::arrow::Type::BOOL: { - const auto& bool_array = static_cast(values); + } else if constexpr (ArrowType::type_id == ::arrow::Type::BOOL) { + const auto& array = static_cast(values); return Calculate(def_levels, rep_levels, num_levels, - [&](int64_t i) { return Roll(bool_array.Value(i)); }); - } - case ::arrow::Type::INT8: - case ::arrow::Type::UINT8: - return CalculateFixedWidth<1>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::INT16: - case ::arrow::Type::UINT16: - case ::arrow::Type::HALF_FLOAT: - return CalculateFixedWidth<2>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::INT32: - case ::arrow::Type::UINT32: - case ::arrow::Type::FLOAT: - case ::arrow::Type::DATE32: - case ::arrow::Type::TIME32: - return 
CalculateFixedWidth<4>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::INT64: - case ::arrow::Type::UINT64: - case ::arrow::Type::DOUBLE: - case ::arrow::Type::DATE64: - case ::arrow::Type::TIME64: - case ::arrow::Type::TIMESTAMP: - case ::arrow::Type::DURATION: - return CalculateFixedWidth<8>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::DECIMAL128: - return CalculateFixedWidth<16>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::DECIMAL256: - return CalculateFixedWidth<32>(def_levels, rep_levels, num_levels, values); - case ::arrow::Type::BINARY: - case ::arrow::Type::STRING: - return CalculateBinaryLike<::arrow::BinaryArray>(def_levels, rep_levels, - num_levels, values); - case ::arrow::Type::LARGE_BINARY: - case ::arrow::Type::LARGE_STRING: - return CalculateBinaryLike<::arrow::LargeBinaryArray>(def_levels, rep_levels, - num_levels, values); - case ::arrow::Type::FIXED_SIZE_BINARY: { + [&](int64_t i) { return Roll(array.Value(i)); }); + } else if constexpr (ArrowType::type_id == ::arrow::Type::FIXED_SIZE_BINARY) { const auto& array = static_cast(values); const auto byte_width = array.byte_width(); return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); - } - case ::arrow::Type::DICTIONARY: + } else if constexpr (ArrowType::type_id == ::arrow::Type::DICTIONARY) { return GetChunks(def_levels, rep_levels, num_levels, *static_cast(values).indices()); - default: + + } else if constexpr (::arrow::is_primitive(ArrowType::type_id)) { + using c_type = typename ArrowType::c_type; + return CalculateFixedWidth(def_levels, rep_levels, num_levels, + values); + } else if constexpr (::arrow::is_decimal(ArrowType::type_id)) { + return CalculateFixedWidth(def_levels, rep_levels, + num_levels, values); + } else if constexpr (::arrow::is_binary_like(ArrowType::type_id)) { + return CalculateBinaryLike<::arrow::BinaryArray>(def_levels, rep_levels, + num_levels, values); + 
} else if constexpr (::arrow::is_large_binary_like(ArrowType::type_id)) { + return CalculateBinaryLike<::arrow::LargeBinaryArray>(def_levels, rep_levels, + num_levels, values); + } else { throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); - } + } + }; + return ::arrow::VisitType(*values.type(), handle_type); } private: From 2e38fc07502810d233a1b5f7faa1e88ccdeb507c Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Mar 2025 19:06:28 +0100 Subject: [PATCH 051/102] Refactor CDC settings + add python docstrings --- cpp/src/parquet/chunker_internal.cc | 7 +-- cpp/src/parquet/chunker_internal.h | 40 +++++++------ cpp/src/parquet/chunker_internal_test.cc | 5 +- cpp/src/parquet/column_writer.cc | 9 +-- cpp/src/parquet/properties.h | 75 ++++++++++++------------ python/pyarrow/_parquet.pxd | 14 ++--- python/pyarrow/_parquet.pyx | 33 +++++------ python/pyarrow/parquet/core.py | 31 ++++++++++ python/run_test.sh | 72 +++++++++++++++++++++++ 9 files changed, 197 insertions(+), 89 deletions(-) create mode 100755 python/run_test.sh diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 99329f7a53d..ff3d1fe5afc 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -290,10 +290,6 @@ class ContentDefinedChunker::Impl { const auto byte_width = array.byte_width(); return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); - } else if constexpr (ArrowType::type_id == ::arrow::Type::DICTIONARY) { - return GetChunks(def_levels, rep_levels, num_levels, - *static_cast(values).indices()); - } else if constexpr (::arrow::is_primitive(ArrowType::type_id)) { using c_type = typename ArrowType::c_type; return CalculateFixedWidth(def_levels, rep_levels, num_levels, @@ -307,6 +303,9 @@ class ContentDefinedChunker::Impl { } else if constexpr (::arrow::is_large_binary_like(ArrowType::type_id)) { return 
CalculateBinaryLike<::arrow::LargeBinaryArray>(def_levels, rep_levels, num_levels, values); + } else if constexpr (::arrow::is_dictionary(ArrowType::type_id)) { + return GetChunks(def_levels, rep_levels, num_levels, + *static_cast(values).indices()); } else { throw ParquetException("Unsupported Arrow array type " + values.type()->ToString()); diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 50530051754..0288be237ce 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -22,6 +22,7 @@ #include #include "arrow/array.h" #include "parquet/level_conversion.h" +#include "parquet/properties.h" namespace parquet::internal { @@ -94,24 +95,29 @@ class ContentDefinedChunker { /// Create a new ContentDefinedChunker instance /// /// @param level_info Information about definition and repetition levels - /// @param min_size Minimum chunk size in bytes, the rolling hash will not be updated - /// until this size is reached for each chunk. Note that all data sent - /// through the hash function is counted towards the chunk size, - /// including definition and repetition levels if present. - /// @param max_size Maximum chunk size in bytes, the chunker will create a new chunk - /// whenever the chunk size exceeds this value. The chunker will - /// attempt to uniformly distribute the chunks between min_size and - /// max_size. + /// @param min_size Minimum chunk size in bytes + /// The rolling hash will not be updated until this size is reached for + /// each chunk. Note that all data sent through the hash function is + /// counted towards the chunk size, including definition and repetition + /// levels if present. + /// @param max_size Maximum chunk size in bytes + /// The chunker creates a new chunk whenever the chunk size exceeds this + /// value. The chunk size distribution approximates a normal + /// distribution between min_size and max_size. 
Note that the parquet + /// writer has a related `data_pagesize` property that controls the + /// maximum size of a parquet data page after encoding. While setting + /// `data_pagesize` to a smaller value than `max_size` doesn't affect + /// the chunking effectiveness, it results in more small parquet data + /// pages. /// @param norm_factor Normalization factor to center the chunk size around the average - /// size more aggressively. By increasing the normalization factor, - /// probability of finding a chunk boundary increases improving the - /// deduplication ratio, but also increases the number of small - /// chunks resulting in small parquet data pages. The default value - /// provides a good balance between deduplication ratio and - /// fragmentation. Use norm_factor=1 or norm_factor=2 if a higher - /// deduplication ratio is required at the expense of fragmentation, - /// norm_factor>2 is typically not increasing the deduplication - /// ratio. + /// size more aggressively, default 0. + /// Increasing the normalization factor increases the probability of + /// finding a chunk boundary, improving the deduplication ratio, but + /// also increases the number of small chunks resulting in many small + /// parquet data pages. The default value provides a good balance + /// between deduplication ratio and fragmentation. Use norm_factor=1 + /// or norm_factor=2 to reach a higher deduplication ratio at the + /// expense of fragmentation. ContentDefinedChunker(const LevelInfo& level_info, int64_t min_size, int64_t max_size, int8_t norm_factor = 0); ~ContentDefinedChunker(); diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index d5c625a0935..bb6fd954f28 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -264,9 +264,8 @@ Result> WriteTableToBuffer(const std::shared_ptr
& auto sink = CreateOutputStream(); auto builder = WriterProperties::Builder(); - builder.enable_cdc() - ->cdc_size_range(min_chunk_size, max_chunk_size) - ->cdc_norm_factor(0); + builder.enable_content_defined_chunking()->content_defined_chunking_options( + min_chunk_size, max_chunk_size, /*norm_factor=*/0); if (enable_dictionary) { builder.enable_dictionary(); } else { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index d59a8a43d8b..4123930cf3a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -754,9 +754,10 @@ class ColumnWriterImpl { fallback_(false), definition_levels_sink_(allocator_), repetition_levels_sink_(allocator_), - content_defined_chunker_(level_info_, properties->cdc_size_range().first, - properties->cdc_size_range().second, - properties->cdc_norm_factor()) { + content_defined_chunker_( + level_info_, properties->content_defined_chunking_options().min_chunk_size, + properties->content_defined_chunking_options().max_chunk_size, + properties->content_defined_chunking_options().norm_factor) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -1363,7 +1364,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, bits_buffer_->ZeroPadding(); } - if (properties_->cdc_enabled()) { + if (properties_->content_defined_chunking_enabled()) { auto boundaries = content_defined_chunker_.GetChunks(def_levels, rep_levels, num_levels, leaf_array); for (auto chunk : boundaries) { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c6d97acc1d1..79fe7198111 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -167,9 +167,6 @@ static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOM static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = SizeStatisticsLevel::PageAndColumnChunk; -static 
constexpr std::pair DEFAULT_CDC_SIZE_RANGE = - std::make_pair(256 * 1024, 1024 * 1024); -static constexpr int8_t DEFAULT_CDC_NORM_FACTOR = 0; class PARQUET_EXPORT ColumnProperties { public: @@ -248,6 +245,14 @@ class PARQUET_EXPORT ColumnProperties { bool page_index_enabled_; }; +struct CDCOptions { + int64_t min_chunk_size; + int64_t max_chunk_size; + int8_t norm_factor; +}; + +static constexpr CDCOptions kDefaultCdcOptions = CDCOptions{256 * 1024, 1024 * 1024, 0}; + class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -264,9 +269,8 @@ class PARQUET_EXPORT WriterProperties { store_decimal_as_integer_(false), page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), - cdc_enabled_(false), - cdc_size_range_(DEFAULT_CDC_SIZE_RANGE), - cdc_norm_factor_(DEFAULT_CDC_NORM_FACTOR) {} + content_defined_chunking_enabled_(false), + content_defined_chunking_options_(kDefaultCdcOptions) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -282,29 +286,28 @@ class PARQUET_EXPORT WriterProperties { size_statistics_level_(properties.size_statistics_level()), sorting_columns_(properties.sorting_columns()), default_column_properties_(properties.default_column_properties()), - cdc_enabled_(properties.cdc_enabled()), - cdc_size_range_(properties.cdc_size_range()), - cdc_norm_factor_(properties.cdc_norm_factor()) {} + content_defined_chunking_enabled_( + properties.content_defined_chunking_enabled()), + content_defined_chunking_options_( + properties.content_defined_chunking_options()) {} virtual ~Builder() {} - Builder* enable_cdc() { - cdc_enabled_ = true; - return this; - } - - Builder* disable_cdc() { - cdc_enabled_ = false; + Builder* enable_content_defined_chunking() { + content_defined_chunking_enabled_ = true; return this; } - Builder* cdc_size_range(int64_t min_size, int64_t max_size) { - cdc_size_range_ = std::make_pair(min_size, max_size); + Builder* disable_content_defined_chunking() { + 
content_defined_chunking_enabled_ = false; return this; } - Builder* cdc_norm_factor(int8_t norm_factor) { - cdc_norm_factor_ = norm_factor; + Builder* content_defined_chunking_options( + int64_t min_chunk_size, int64_t max_chunk_size, + int8_t norm_factor = kDefaultCdcOptions.norm_factor) { + content_defined_chunking_options_ = + CDCOptions{min_chunk_size, max_chunk_size, norm_factor}; return this; } @@ -730,8 +733,8 @@ class PARQUET_EXPORT WriterProperties { pagesize_, version_, created_by_, page_checksum_enabled_, size_statistics_level_, std::move(file_encryption_properties_), default_column_properties_, column_properties, data_page_version_, - store_decimal_as_integer_, std::move(sorting_columns_), cdc_enabled_, - cdc_size_range_, cdc_norm_factor_)); + store_decimal_as_integer_, std::move(sorting_columns_), + content_defined_chunking_enabled_, content_defined_chunking_options_)); } private: @@ -761,9 +764,8 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map statistics_enabled_; std::unordered_map page_index_enabled_; - bool cdc_enabled_; - std::pair cdc_size_range_; - int8_t cdc_norm_factor_; + bool content_defined_chunking_enabled_; + CDCOptions content_defined_chunking_options_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -788,9 +790,12 @@ class PARQUET_EXPORT WriterProperties { inline bool page_checksum_enabled() const { return page_checksum_enabled_; } - inline bool cdc_enabled() const { return cdc_enabled_; } - inline std::pair cdc_size_range() const { return cdc_size_range_; } - inline int8_t cdc_norm_factor() const { return cdc_norm_factor_; } + inline bool content_defined_chunking_enabled() const { + return content_defined_chunking_enabled_; + } + inline CDCOptions content_defined_chunking_options() const { + return content_defined_chunking_options_; + } inline SizeStatisticsLevel size_statistics_level() const { return size_statistics_level_; @@ -894,8 +899,8 @@ class PARQUET_EXPORT WriterProperties { const 
ColumnProperties& default_column_properties, const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, - std::vector sorting_columns, bool cdc_enabled, - std::pair cdc_size_range, int8_t cdc_norm_factor) + std::vector sorting_columns, bool content_defined_chunking_enabled, + CDCOptions content_defined_chunking_options) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -911,9 +916,8 @@ class PARQUET_EXPORT WriterProperties { sorting_columns_(std::move(sorting_columns)), default_column_properties_(default_column_properties), column_properties_(column_properties), - cdc_enabled_(cdc_enabled), - cdc_size_range_(cdc_size_range), - cdc_norm_factor_(cdc_norm_factor) {} + content_defined_chunking_enabled_(content_defined_chunking_enabled), + content_defined_chunking_options_(content_defined_chunking_options) {} MemoryPool* pool_; int64_t dictionary_pagesize_limit_; @@ -934,9 +938,8 @@ class PARQUET_EXPORT WriterProperties { ColumnProperties default_column_properties_; std::unordered_map column_properties_; - bool cdc_enabled_; - std::pair cdc_size_range_; - int8_t cdc_norm_factor_; + bool content_defined_chunking_enabled_; + CDCOptions content_defined_chunking_options_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 02bb070aadb..dec04eb77e0 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -495,10 +495,11 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_write_page_index() Builder* enable_page_checksum() Builder* disable_page_checksum() - Builder* enable_cdc() - Builder* disable_cdc() - Builder* cdc_size_range(uint64_t min_size, uint64_t max_size) - Builder* cdc_norm_factor(uint8_t norm_factor) + Builder* enable_content_defined_chunking() + Builder* disable_content_defined_chunking() + 
Builder* content_defined_chunking_options(int64_t min_size, + int64_t max_size, + int8_t norm_factor) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: @@ -516,7 +517,6 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() - cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReader: FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader) @@ -650,9 +650,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=*, sorting_columns=*, store_decimal_as_integer=*, - cdc=*, - cdc_size_range=*, - cdc_norm_factor=*, + use_content_defined_chunking=* ) except * diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index e0b39666e1e..c6c58b456c9 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1973,9 +1973,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=False, sorting_columns=None, store_decimal_as_integer=False, - cdc=False, - cdc_size_range=None, - cdc_norm_factor=None) except *: + use_content_defined_chunking=False) except *: """General writer properties""" cdef: @@ -2105,7 +2103,6 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( "'column_encoding' should be a dictionary or a string") # size limits - if data_page_size is not None: props.data_pagesize(data_page_size) @@ -2116,16 +2113,20 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.dictionary_pagesize_limit(dictionary_pagesize_limit) # content defined chunking - if cdc: - props.enable_cdc() - else: - props.disable_cdc() - if cdc_size_range is not None: - min_size, max_size = cdc_size_range - props.cdc_size_range(min_size, max_size) - if cdc_norm_factor is not None: - props.cdc_norm_factor(cdc_norm_factor) + if use_content_defined_chunking is True: + props.enable_content_defined_chunking() + elif 
use_content_defined_chunking is False: + props.disable_content_defined_chunking() + elif isinstance(use_content_defined_chunking, dict): + props.enable_content_defined_chunking() + props.content_defined_chunking_options( + use_content_defined_chunking["min_chunk_size"], + use_content_defined_chunking["max_chunk_size"], + use_content_defined_chunking.get("norm_factor", 0) + ) + else: + raise TypeError("'use_content_defined_chunking' should be either boolean or a dictionary") # encryption @@ -2277,9 +2278,7 @@ cdef class ParquetWriter(_Weakrefable): write_page_checksum=False, sorting_columns=None, store_decimal_as_integer=False, - cdc=False, - cdc_size_range=None, - cdc_norm_factor=None): + use_content_defined_chunking=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -2314,7 +2313,7 @@ cdef class ParquetWriter(_Weakrefable): write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, store_decimal_as_integer=store_decimal_as_integer, - cdc=cdc, cdc_size_range=cdc_size_range, cdc_norm_factor=cdc_norm_factor + use_content_defined_chunking=use_content_defined_chunking ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 3196686b7ae..9610a859da4 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -892,6 +892,37 @@ def _sanitize_table(table, new_schema, flavor): - fixed_len_byte_array: for precision > 18. As a consequence, decimal columns stored in integer types are more compact. +use_content_defined_chunking : bool or dict, default False + Optimize parquet files for content addressable storage (CAS) systems by writing + data pages according to content-defined chunk boundaries. This allows for more + efficient deduplication of data across files, hence more efficient network + transfers and storage. 
The chunking is based on a rolling hash algorithm that + identifies chunk boundaries based on the actual content of the data. + + If set to ``True``, a default configuration is used with `min_chunk_size=256 KiB` + and `max_chunk_size=1024 KiB`. The chunk size distribution approximates a normal + distribution between `min_chunk_size` and `max_chunk_size` (sizes are accounted + before any Parquet encodings). + + A `dict` can be passed to adjust the chunker parameters with the following keys: + - `min_chunk_size`: minimum chunk size in bytes, default 256 KiB + The rolling hash will not be updated until this size is reached for each chunk. + Note that all data sent through the hash function is counted towards the chunk + size, including definition and repetition levels if present. + - `max_chunk_size`: maximum chunk size in bytes, default is 1024 KiB + The chunker will create a new chunk whenever the chunk size exceeds this value. + Note that the parquet writer has a related `data_page_size` property that controls + the maximum size of a parquet data page after encoding. While setting + `data_page_size` to a smaller value than `max_chunk_size` doesn't affect the + chunking effectiveness, it results in more small parquet data pages. + - `norm_factor`: normalization factor to center the chunk size around the average + size more aggressively, default 0 + Increasing the normalization factor increases the probability of finding a chunk boundary, + improving the deduplication ratio, but also increasing the number of small chunks + resulting in many small parquet data pages. The default value provides a good + balance between deduplication ratio and fragmentation. Use norm_factor=1 or + norm_factor=2 to reach a higher deduplication ratio at the expense of + fragmentation. 
""" _parquet_writer_example_doc = """\ diff --git a/python/run_test.sh b/python/run_test.sh new file mode 100755 index 00000000000..6476c12dcd4 --- /dev/null +++ b/python/run_test.sh @@ -0,0 +1,72 @@ +set -e + +# -DARROW_USE_ASAN=OFF \ +# -DARROW_USE_UBSAN=OFF \ +# -DARROW_USE_TSAN=OFF \ + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +ARROW_DIR=${SCRIPT_DIR}/.. +export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} +export ARROW_HOME=$CONDA_PREFIX +export PARQUET_TEST_DATA=${ARROW_DIR}/cpp/submodules/parquet-testing/data +export ARROW_TEST_DATA=${ARROW_DIR}/testing/data + +export ARROW_HDFS_TEST_HOST=impala +export ARROW_HDFS_TEST_PORT=8020 +export ARROW_HDFS_TEST_USER=hdfs + +mkdir -p ${ARROW_DIR}/cpp/build +pushd ${ARROW_DIR}/cpp/build + +cmake -GNinja \ + -DARROW_BUILD_BENCHMARKS=OFF \ + -DARROW_BUILD_STATIC=OFF \ + -DARROW_BUILD_TESTS=ON \ + -DARROW_USE_ASAN=OFF \ + -DARROW_DATASET=ON \ + -DARROW_EXTRA_ERROR_CONTEXT=ON \ + -DARROW_BUILD_INTEGRATION=ON \ + -DARROW_DEPENDENCY_SOURCE=CONDA \ + -DARROW_FLIGHT=OFF \ + -DARROW_GANDIVA=OFF \ + -DARROW_JEMALLOC=ON \ + -DARROW_MIMALLOC=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_COMPUTE=ON \ + -DARROW_PARQUET=ON \ + -DARROW_CSV=ON \ + -DARROW_ORC=OFF \ + -DARROW_USE_CCACHE=ON \ + -DARROW_S3=ON \ + -DARROW_TEST_MEMCHECK=OFF \ + -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ + -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + .. 
+ +ninja +ninja install + +popd + +export PYARROW_CMAKE_GENERATOR=Ninja +export PYARROW_BUILD_TYPE=debug +export PYARROW_WITH_PARQUET=1 +# export PYARROW_WITH_HDFS=1 +# export PYARROW_WITH_GANDIVA=0 +export PYARROW_WITH_DATASET=1 +# export PYARROW_WITH_FLIGHT=1 +export PYARROW_WITH_S3=1 +export PYARROW_PARALLEL=8 +# export PYARROW_WITH_ORC=1 + +# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib +# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib + +pushd ${ARROW_DIR}/python +#python setup.py build_ext --inplace +python setup.py develop +popd +# pytest -sv "$@" From 4558a6c338bf7cd825e96fc56cdf035f4365dd19 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Mar 2025 19:55:18 +0100 Subject: [PATCH 052/102] Fix python linting error --- python/pyarrow/_parquet.pyx | 3 +- python/run_test.sh | 72 ------------------------------------- 2 files changed, 2 insertions(+), 73 deletions(-) delete mode 100755 python/run_test.sh diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c6c58b456c9..0516a999621 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2126,7 +2126,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( use_content_defined_chunking.get("norm_factor", 0) ) else: - raise TypeError("'use_content_defined_chunking' should be either boolean or a dictionary") + raise TypeError( + "'use_content_defined_chunking' should be either boolean or a dictionary") # encryption diff --git a/python/run_test.sh b/python/run_test.sh deleted file mode 100755 index 6476c12dcd4..00000000000 --- a/python/run_test.sh +++ /dev/null @@ -1,72 +0,0 @@ -set -e - -# -DARROW_USE_ASAN=OFF \ -# -DARROW_USE_UBSAN=OFF \ -# -DARROW_USE_TSAN=OFF \ - -SCRIPT_DIR=$(cd -- 
"$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) -ARROW_DIR=${SCRIPT_DIR}/.. -export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} -export ARROW_HOME=$CONDA_PREFIX -export PARQUET_TEST_DATA=${ARROW_DIR}/cpp/submodules/parquet-testing/data -export ARROW_TEST_DATA=${ARROW_DIR}/testing/data - -export ARROW_HDFS_TEST_HOST=impala -export ARROW_HDFS_TEST_PORT=8020 -export ARROW_HDFS_TEST_USER=hdfs - -mkdir -p ${ARROW_DIR}/cpp/build -pushd ${ARROW_DIR}/cpp/build - -cmake -GNinja \ - -DARROW_BUILD_BENCHMARKS=OFF \ - -DARROW_BUILD_STATIC=OFF \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_USE_ASAN=OFF \ - -DARROW_DATASET=ON \ - -DARROW_EXTRA_ERROR_CONTEXT=ON \ - -DARROW_BUILD_INTEGRATION=ON \ - -DARROW_DEPENDENCY_SOURCE=CONDA \ - -DARROW_FLIGHT=OFF \ - -DARROW_GANDIVA=OFF \ - -DARROW_JEMALLOC=ON \ - -DARROW_MIMALLOC=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_ZSTD=ON \ - -DARROW_COMPUTE=ON \ - -DARROW_PARQUET=ON \ - -DARROW_CSV=ON \ - -DARROW_ORC=OFF \ - -DARROW_USE_CCACHE=ON \ - -DARROW_S3=ON \ - -DARROW_TEST_MEMCHECK=OFF \ - -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - .. 
- -ninja -ninja install - -popd - -export PYARROW_CMAKE_GENERATOR=Ninja -export PYARROW_BUILD_TYPE=debug -export PYARROW_WITH_PARQUET=1 -# export PYARROW_WITH_HDFS=1 -# export PYARROW_WITH_GANDIVA=0 -export PYARROW_WITH_DATASET=1 -# export PYARROW_WITH_FLIGHT=1 -export PYARROW_WITH_S3=1 -export PYARROW_PARALLEL=8 -# export PYARROW_WITH_ORC=1 - -# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib -# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib - -pushd ${ARROW_DIR}/python -#python setup.py build_ext --inplace -python setup.py develop -popd -# pytest -sv "$@" From 119393a1ac36d27f003e790bfe3559e71be28eb0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 17 Mar 2025 20:29:55 +0100 Subject: [PATCH 053/102] Calculate mask bits using arrow bit utils --- cpp/src/parquet/chunker_internal.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index ff3d1fe5afc..859fabb4eb5 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -22,6 +22,7 @@ #include #include "arrow/array.h" +#include "arrow/util/bit_util.h" #include "arrow/util/logging.h" #include "arrow/util/unreachable.h" #include "arrow/visit_type_inline.h" @@ -67,8 +68,8 @@ static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) // `min_size` bytes int64_t target_size = avg_size - min_size; // assuming that the gear hash has a uniform distribution, we can calculate the mask - // by taking the log2 of the target size - size_t mask_bits = static_cast(std::floor(std::log2(target_size))); + // by taking the floor(log2(target_size)) + size_t mask_bits = ::arrow::bit_util::NumRequiredBits(target_size) - 1; // 
-3 because we are using 8 hash tables to have more gaussian-like distribution, // a user defined `norm_factor` can be used to adjust the mask size, hence the matching // probability, by increasing the norm_factor we increase the probability of matching From d7f366654c26e02d73ac6fc05a5dca958d874a20 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Tue, 18 Mar 2025 11:34:17 +0100 Subject: [PATCH 054/102] Raise from WriteBatch() and WriteBatchSpaced() if CDC is enabled --- cpp/src/parquet/column_writer.cc | 42 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 4123930cf3a..b29c01712b1 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1276,6 +1276,16 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, int64_t WriteBatch(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const T* values) override { + if (properties_->content_defined_chunking_enabled()) { + throw ParquetException( + "Content-defined chunking is not yet supported for WriteBatch() and " + "WriteBatchSpaced(), use WriteArrow() instead"); + } + return WriteBatchInternal(num_values, def_levels, rep_levels, values); + } + + int64_t WriteBatchInternal(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels, const T* values) { // We check for DataPage limits only after we have inserted the values. If a user // writes a large number of values, the DataPage size can be much above the limit. // The purpose of this chunking is to bound this. 
Even if a user writes large number @@ -1308,6 +1318,18 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) override { + if (properties_->content_defined_chunking_enabled()) { + throw ParquetException( + "Content-defined chunking is not yet supported for WriteBatch() and " + "WriteBatchSpaced(), use WriteArrow() instead"); + } + return WriteBatchSpacedInternal(num_values, def_levels, rep_levels, valid_bits, + valid_bits_offset, values); + } + + void WriteBatchSpacedInternal(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels, const uint8_t* valid_bits, + int64_t valid_bits_offset, const T* values) { // Like WriteBatch, but for spaced values int64_t value_offset = 0; auto WriteChunk = [&](int64_t offset, int64_t batch_size, bool check_page) { @@ -1436,11 +1458,12 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, this->descr()->schema_node()->is_required() || (array.null_count() == 0); if (!maybe_parent_nulls && no_nulls) { - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, values)); + PARQUET_CATCH_NOT_OK( + WriteBatchInternal(num_levels, def_levels, rep_levels, values)); } else { - PARQUET_CATCH_NOT_OK(WriteBatchSpaced(num_levels, def_levels, rep_levels, - data.null_bitmap_data(), data.offset(), - values)); + PARQUET_CATCH_NOT_OK(WriteBatchSpacedInternal(num_levels, def_levels, rep_levels, + data.null_bitmap_data(), + data.offset(), values)); } return Status::OK(); } @@ -2003,11 +2026,11 @@ Status TypedColumnWriterImpl::WriteArrowSerialize( bool no_nulls = this->descr()->schema_node()->is_required() || (array.null_count() == 0); if (!maybe_parent_nulls && no_nulls) { - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, buffer)); + PARQUET_CATCH_NOT_OK(WriteBatchInternal(num_levels, def_levels, rep_levels, buffer)); } else { - 
PARQUET_CATCH_NOT_OK(WriteBatchSpaced(num_levels, def_levels, rep_levels, - array.null_bitmap_data(), array.offset(), - buffer)); + PARQUET_CATCH_NOT_OK(WriteBatchSpacedInternal(num_levels, def_levels, rep_levels, + array.null_bitmap_data(), + array.offset(), buffer)); } return Status::OK(); } @@ -2131,7 +2154,8 @@ Status TypedColumnWriterImpl::WriteArrowDense( const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) { switch (array.type()->id()) { case ::arrow::Type::NA: { - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr)); + PARQUET_CATCH_NOT_OK( + WriteBatchInternal(num_levels, def_levels, rep_levels, nullptr)); } break; WRITE_SERIALIZE_CASE(INT8) WRITE_SERIALIZE_CASE(UINT8) From 1b67e6bc91101f6661df1338fb4f131b95ffbab4 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 19 Mar 2025 12:56:45 +0100 Subject: [PATCH 055/102] Test that WriteBatch() and WriteBatchSpaced() raises with CDC enabled --- cpp/src/parquet/chunker_internal_test.cc | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index bb6fd954f28..b76fc277e48 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -959,7 +959,37 @@ INSTANTIATE_TEST_SUITE_P( CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true, 10})); +TEST(TestColumnCDC, WriteSingleColumnParquetFile) { + // Define the schema with a single column "number" + auto schema = std::dynamic_pointer_cast(schema::GroupNode::Make( + "root", Repetition::REQUIRED, + {schema::PrimitiveNode::Make("number", Repetition::REQUIRED, Type::INT32)})); + + auto sink = CreateOutputStream(); + auto builder = WriterProperties::Builder(); + auto props = builder.enable_content_defined_chunking()->build(); + + auto writer = ParquetFileWriter::Open(sink, schema, props); + auto row_group_writer = writer->AppendRowGroup(); + + // Create 
a column writer for the "number" column + auto column_writer = row_group_writer->NextColumn(); + auto& int_column_writer = dynamic_cast(*column_writer); + + std::vector numbers = {1, 2, 3, 4, 5}; + std::vector valid_bits = {1, 0, 1, 0, 1}; + EXPECT_THROW( + int_column_writer.WriteBatch(numbers.size(), nullptr, nullptr, numbers.data()), + ParquetException); + EXPECT_THROW(int_column_writer.WriteBatchSpaced(numbers.size(), nullptr, nullptr, + valid_bits.data(), 0, numbers.data()), + ParquetException); +} + } // namespace parquet // TODO: // - test multiple row groups +// - place information about the used CDC parameters to the metadata +// - test the effect of the normalization factor +// - do more validation on min/max chunk size From 53282cc0c5090a202000cd41a5ce8f1eaf093ce6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 19 Mar 2025 17:57:52 +0100 Subject: [PATCH 056/102] Add tests for the multi-row-group use case --- cpp/src/parquet/chunker_internal_test.cc | 523 ++++++++++++++++------- 1 file changed, 375 insertions(+), 148 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index b76fc277e48..34431469e28 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -42,6 +42,7 @@ using ::arrow::DataType; using ::arrow::default_memory_pool; using ::arrow::Field; using ::arrow::Result; +using ::arrow::Schema; using ::arrow::Table; using ::arrow::io::BufferReader; using ::parquet::arrow::FileReader; @@ -294,16 +295,20 @@ Result> ReadTableFromBuffer(const std::shared_ptr using ChunkList = std::vector; // Type to represent the sizes and lengths of the data pages in a column. 
-struct PageInfo { - ChunkList lengths; - ChunkList sizes; + +struct RowGroupInfo { + ChunkList page_lengths; + ChunkList page_sizes; }; -PageInfo GetColumnPageInfo(const std::shared_ptr& data, int column_index = 0) { +using ParquetInfo = std::vector; + +ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, + int column_index = 0) { // Read the parquet data out of the buffer and get the sizes and lengths of the // data pages in given column. We assert on the sizes and lengths of the pages // to ensure that the chunking is done correctly. - PageInfo result; + ParquetInfo result; auto buffer_reader = std::make_shared(data); auto parquet_reader = ParquetFileReader::Open(std::move(buffer_reader)); @@ -311,26 +316,30 @@ PageInfo GetColumnPageInfo(const std::shared_ptr& data, int column_index auto metadata = parquet_reader->metadata(); for (int rg = 0; rg < metadata->num_row_groups(); rg++) { auto page_reader = parquet_reader->RowGroup(rg)->GetColumnPageReader(column_index); + RowGroupInfo rg_info; while (auto page = page_reader->NextPage()) { if (page->type() == PageType::DATA_PAGE || page->type() == PageType::DATA_PAGE_V2) { auto data_page = static_cast(page.get()); - result.sizes.push_back(data_page->size()); - result.lengths.push_back(data_page->num_values()); + rg_info.page_sizes.push_back(data_page->size()); + rg_info.page_lengths.push_back(data_page->num_values()); } } + result.push_back(rg_info); } return result; } -Result WriteAndGetPageInfo(const std::shared_ptr
& table, - uint64_t min_chunk_size, uint64_t max_chunk_size, - bool enable_dictionary = false, - int column_index = 0) { +Result WriteAndGetParquetInfo(const std::shared_ptr
& table, + uint64_t min_chunk_size, + uint64_t max_chunk_size, + bool enable_dictionary = false, + int64_t row_group_size = 1024 * 1024, + int column_index = 0) { // Write the table to a buffer and read it back to get the page sizes - ARROW_ASSIGN_OR_RAISE( - auto buffer, - WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); + ARROW_ASSIGN_OR_RAISE(auto buffer, + WriteTableToBuffer(table, min_chunk_size, max_chunk_size, + enable_dictionary, row_group_size)); ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); RETURN_NOT_OK(readback->ValidateFull()); @@ -338,7 +347,7 @@ Result WriteAndGetPageInfo(const std::shared_ptr
& table, ARROW_RETURN_IF(!readback->Equals(*table), Status::Invalid("Readback table not equal to original")); } - return GetColumnPageInfo(buffer, column_index); + return GetColumnParquetInfo(buffer, column_index); } // A git-hunk like side-by-side data structure to represent the differences between two @@ -574,80 +583,82 @@ TEST(TestFindDifferences, AdditionalCase) { } } -void AssertUpdateCase(const std::shared_ptr<::arrow::DataType>& dtype, - const ChunkList& original, const ChunkList& modified, - uint8_t n_modifications) { - auto diffs = FindDifferences(original, modified); - if (diffs.size() > n_modifications) { - PrintDifferences(original, modified, diffs); - } - ASSERT_LE(diffs.size(), n_modifications); - - for (const auto& diff : diffs) { - if (!::arrow::is_list_like(dtype->id())) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum, right_sum); - } - ASSERT_LE(diff.first.size(), 2); - ASSERT_LE(diff.second.size(), 2); +void AssertPageLengthDifferences(const RowGroupInfo& original, + const RowGroupInfo& modified, + int8_t exact_number_of_equal_diffs, + int8_t exact_number_of_larger_diffs, + int8_t exact_number_of_smaller_diffs, + int64_t edit_length = 0) { + // Asserts that the differences between the original and modified page lengths + // are as expected. A longest common subsequence diff is calculated on the original + // and modified sequences of page lengths. The exact_number_of_equal_diffs, + // exact_number_of_larger_diffs, and exact_number_of_smaller_diffs parameters specify + // the expected number of differences with equal, larger, and smaller sums of the page + // lengths respectively. The edit_length parameter is used to verify that the page + // length differences are exactly equal to the edit_length. 
+ auto diffs = FindDifferences(original.page_lengths, modified.page_lengths); + size_t expected_number_of_diffs = exact_number_of_equal_diffs + + exact_number_of_larger_diffs + + exact_number_of_smaller_diffs; + if (diffs.size() != expected_number_of_diffs) { + PrintDifferences(original.page_lengths, modified.page_lengths, diffs); } - if (diffs.size() == 0) { // no differences found, the arrays are equal - ASSERT_TRUE(original == modified); - } -} - -void AssertDeleteCase(const std::shared_ptr<::arrow::DataType>& dtype, - const ChunkList& original, const ChunkList& modified, - uint8_t n_modifications, uint64_t edit_length) { - auto diffs = FindDifferences(original, modified); - if (diffs.size() != n_modifications) { - PrintDifferences(original, modified, diffs); + ASSERT_TRUE(original.page_lengths == modified.page_lengths); } - ASSERT_EQ(diffs.size(), n_modifications); + ASSERT_EQ(diffs.size(), expected_number_of_diffs); + uint8_t equal_diffs = 0; + int8_t larger_diffs = 0; + int8_t smaller_diffs = 0; for (const auto& diff : diffs) { - if (!::arrow::is_list_like(dtype->id())) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum, right_sum + edit_length); + uint64_t original_sum = 0, modified_sum = 0; + for (const auto& val : diff.first) original_sum += val; + for (const auto& val : diff.second) modified_sum += val; + + if (original_sum == modified_sum) { + equal_diffs++; + } else if (original_sum < modified_sum) { + larger_diffs++; + ASSERT_EQ(original_sum + edit_length, modified_sum); + } else if (original_sum > modified_sum) { + smaller_diffs++; + ASSERT_EQ(original_sum, modified_sum + edit_length); } ASSERT_LE(diff.first.size(), 2); ASSERT_LE(diff.second.size(), 2); } + + ASSERT_EQ(equal_diffs, exact_number_of_equal_diffs); + ASSERT_EQ(larger_diffs, exact_number_of_larger_diffs); + ASSERT_EQ(smaller_diffs, exact_number_of_smaller_diffs); } 
-void AssertInsertCase(const std::shared_ptr<::arrow::DataType>& dtype, - const ChunkList& original, const ChunkList& modified, - uint8_t n_modifications, uint64_t edit_length) { - auto diffs = FindDifferences(original, modified); - if (diffs.size() != n_modifications) { - PrintDifferences(original, modified, diffs); +void AssertPageLengthDifferences(const RowGroupInfo& original, + const RowGroupInfo& modified, + uint8_t max_number_of_equal_diffs) { + // A less restrictive version of the above assertion function mainly used to + // assert the update case. + auto diffs = FindDifferences(original.page_lengths, modified.page_lengths); + if (diffs.size() > max_number_of_equal_diffs) { + PrintDifferences(original.page_lengths, modified.page_lengths, diffs); } - ASSERT_EQ(diffs.size(), n_modifications); + ASSERT_LE(diffs.size(), max_number_of_equal_diffs); for (const auto& diff : diffs) { - if (!::arrow::is_list_like(dtype->id())) { - uint64_t left_sum = 0, right_sum = 0; - for (const auto& val : diff.first) left_sum += val; - for (const auto& val : diff.second) right_sum += val; - ASSERT_EQ(left_sum + edit_length, right_sum); - } + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum); ASSERT_LE(diff.first.size(), 2); ASSERT_LE(diff.second.size(), 2); } -} -void AssertAppendCase(const ChunkList& original, const ChunkList& modified) { - ASSERT_GE(modified.size(), original.size()); - for (size_t i = 0; i < original.size() - 1; i++) { - ASSERT_EQ(original[i], modified[i]); + if (diffs.size() == 0) { + // no differences found, the arrays are equal + ASSERT_TRUE(original.page_lengths == modified.page_lengths); } - ASSERT_GT(modified[original.size() - 1], original.back()); } uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { @@ -687,8 +698,8 @@ void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max, } void 
AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, - PageInfo base_result, PageInfo modified_result, bool nullable, - bool enable_dictionary, int64_t min_chunk_size, + const RowGroupInfo& base_info, const RowGroupInfo& modified_info, + bool nullable, bool enable_dictionary, int64_t min_chunk_size, int64_t max_chunk_size) { if (::arrow::is_fixed_width(dtype->id()) && !nullable) { // for nullable types we cannot calculate the exact number of elements because @@ -696,14 +707,14 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, auto byte_width = (dtype->id() == ::arrow::Type::BOOL) ? 1 : dtype->byte_width(); auto min_length = ElementCount(min_chunk_size, byte_width, nullable); auto max_length = ElementCount(max_chunk_size, byte_width, nullable); - AssertAllBetween(base_result.lengths, min_length, max_length, + AssertAllBetween(base_info.page_lengths, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_result.lengths, min_length, max_length, + AssertAllBetween(modified_info.page_lengths, min_length, max_length, /*expect_dictionary_fallback=*/enable_dictionary); } else if (::arrow::is_base_binary_like(dtype->id()) && !nullable && !enable_dictionary) { - AssertAllBetween(base_result.sizes, min_chunk_size, max_chunk_size); - AssertAllBetween(modified_result.sizes, min_chunk_size, max_chunk_size); + AssertAllBetween(base_info.page_sizes, min_chunk_size, max_chunk_size); + AssertAllBetween(modified_info.page_sizes, min_chunk_size, max_chunk_size); } } @@ -763,17 +774,26 @@ TEST_P(TestColumnCDC, DeleteOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, - 
enable_dictionary, kMinChunkSize, kMaxChunkSize); - - AssertDeleteCase(param.dtype, base_result.lengths, modified_result.lengths, 1, - part2_->num_rows()); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + auto edit_length = part2_->num_rows(); + if (::arrow::is_list_like(param.dtype->id())) { + edit_length += 1; + } + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/0, + /*exact_number_of_smaller_diffs=*/1, edit_length); } } @@ -787,16 +807,26 @@ TEST_P(TestColumnCDC, DeleteTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertDeleteCase(param.dtype, base_result.lengths, modified_result.lengths, 2, - part2_->num_rows()); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + 
param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + auto edit_length = part2_->num_rows(); + if (::arrow::is_list_like(param.dtype->id())) { + edit_length += 1; + } + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/0, + /*exact_number_of_smaller_diffs=*/2, edit_length); } } @@ -809,15 +839,18 @@ TEST_P(TestColumnCDC, UpdateOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertUpdateCase(param.dtype, base_result.lengths, modified_result.lengths, 1); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*max_number_of_equal_diffs=*/1); } } @@ -832,15 +865,18 @@ TEST_P(TestColumnCDC, UpdateTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, 
param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertUpdateCase(param.dtype, base_result.lengths, modified_result.lengths, 2); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*max_number_of_equal_diffs=*/2); } } @@ -853,16 +889,25 @@ TEST_P(TestColumnCDC, InsertOnce) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertInsertCase(param.dtype, base_result.lengths, modified_result.lengths, 1, - part2_->num_rows()); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + auto edit_length = part2_->num_rows(); + if (::arrow::is_list_like(param.dtype->id())) { + edit_length += 1; + } + 
AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/0, edit_length); } } @@ -876,16 +921,25 @@ TEST_P(TestColumnCDC, InsertTwice) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, base_result, modified_result, param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertInsertCase(param.dtype, base_result.lengths, modified_result.lengths, 2, - part2_->num_rows()); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + auto edit_length = part2_->num_rows(); + if (::arrow::is_list_like(param.dtype->id())) { + edit_length += 1; + } + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/2, + /*exact_number_of_smaller_diffs=*/0, edit_length); } } @@ -898,15 +952,24 @@ TEST_P(TestColumnCDC, Append) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto base_result, - WriteAndGetPageInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN( - auto modified_result, - WriteAndGetPageInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - - AssertChunkSizes(param.dtype, 
base_result, modified_result, param.is_nullable, - enable_dictionary, kMinChunkSize, kMaxChunkSize); - AssertAppendCase(base_result.lengths, modified_result.lengths); + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); + ASSERT_OK_AND_ASSIGN(auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), + param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + auto original_page_lengths = base_info.front().page_lengths; + auto modified_page_lengths = modified_info.front().page_lengths; + ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); + for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { + ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); + } + ASSERT_GT(modified_page_lengths.back(), original_page_lengths.back()); } } @@ -919,12 +982,13 @@ TEST_P(TestColumnCDC, EmptyTable) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN(auto result, - WriteAndGetPageInfo(empty_table, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + WriteAndGetParquetInfo(empty_table, kMinChunkSize, kMaxChunkSize, + enable_dictionary)); // An empty table should result in no data pages - ASSERT_TRUE(result.lengths.empty()); - ASSERT_TRUE(result.sizes.empty()); + ASSERT_EQ(result.size(), 1); + ASSERT_TRUE(result.front().page_lengths.empty()); + ASSERT_TRUE(result.front().page_sizes.empty()); } } @@ -986,10 +1050,173 @@ TEST(TestColumnCDC, WriteSingleColumnParquetFile) { ParquetException); } +class TestColumnCDCMultipleRowGroups : public ::testing::Test { + protected: + // Column random table parts for testing + std::shared_ptr dtype_; + std::shared_ptr
part1_, part2_, part3_; + std::shared_ptr
edit1_, edit2_, edit3_; + + void SetUp() override { + auto constexpr kPartLength = 256 * 1024; + auto constexpr kEditLength = 128; + + dtype_ = ::arrow::int32(); + auto field = ::arrow::field("f0", dtype_, true); + auto schema = ::arrow::schema({field}); + + ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, kPartLength, 0)); + ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, kPartLength, 2)); + ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, kPartLength, 4)); + + ASSERT_OK_AND_ASSIGN(edit1_, GenerateTable(schema, kEditLength, 1)); + ASSERT_OK_AND_ASSIGN(edit2_, GenerateTable(schema, kEditLength, 3)); + ASSERT_OK_AND_ASSIGN(edit3_, GenerateTable(schema, kEditLength, 5)); + } +}; + +TEST_F(TestColumnCDCMultipleRowGroups, InsertOnce) { + auto constexpr kRowGroupLength = 128 * 1024; + auto constexpr kEnableDictionary = false; + auto constexpr kMinChunkSize = 0 * 1024; + auto constexpr kMaxChunkSize = 128 * 1024; + + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto inserted, + ConcatAndCombine({part1_, edit1_, edit2_, part2_, part3_})); + ASSERT_FALSE(base->Equals(*inserted)); + ASSERT_EQ(inserted->num_rows(), base->num_rows() + edit2_->num_rows()); + + ASSERT_OK_AND_ASSIGN(auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN(auto inserted_info, + WriteAndGetParquetInfo(inserted, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(inserted_info.size(), 7); + + ASSERT_EQ(base_info.at(0).page_lengths, inserted_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, inserted_info.at(1).page_lengths); + for (size_t i = 2; i < inserted_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), inserted_info.at(i), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/1, 
edit2_->num_rows()); + } + AssertPageLengthDifferences(base_info.back(), inserted_info.back(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/0, edit2_->num_rows()); +} + +TEST_F(TestColumnCDCMultipleRowGroups, DeleteOnce) { + auto constexpr kRowGroupLength = 128 * 1024; + auto constexpr kEnableDictionary = false; + auto constexpr kMinChunkSize = 0 * 1024; + auto constexpr kMaxChunkSize = 128 * 1024; + + ASSERT_OK_AND_ASSIGN(auto base, + ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); + ASSERT_OK_AND_ASSIGN(auto deleted, ConcatAndCombine({part1_, part2_, part3_, edit2_})); + ASSERT_FALSE(base->Equals(*deleted)); + ASSERT_EQ(deleted->num_rows(), base->num_rows() - edit1_->num_rows()); + + ASSERT_OK_AND_ASSIGN(auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN(auto deleted_info, + WriteAndGetParquetInfo(deleted, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(deleted_info.size(), 7); + + ASSERT_EQ(base_info.at(0).page_lengths, deleted_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, deleted_info.at(1).page_lengths); + for (size_t i = 2; i < deleted_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), deleted_info.at(i), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/1, edit1_->num_rows()); + } + AssertPageLengthDifferences(base_info.back(), deleted_info.back(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/0, + /*exact_number_of_smaller_diffs=*/1, edit1_->num_rows()); +} + +TEST_F(TestColumnCDCMultipleRowGroups, UpdateOnce) { + auto constexpr kRowGroupLength = 128 * 1024; + auto constexpr kEnableDictionary = false; + auto constexpr kMinChunkSize = 0 * 1024; + auto constexpr kMaxChunkSize = 128 * 1024; + + 
ASSERT_OK_AND_ASSIGN(auto base, + ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); + ASSERT_OK_AND_ASSIGN(auto updated, + ConcatAndCombine({part1_, edit3_, part2_, part3_, edit2_})); + ASSERT_FALSE(base->Equals(*updated)); + + ASSERT_OK_AND_ASSIGN(auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN(auto updated_info, + WriteAndGetParquetInfo(updated, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(updated_info.size(), 7); + + ASSERT_EQ(base_info.at(0).page_lengths, updated_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, updated_info.at(1).page_lengths); + AssertPageLengthDifferences(base_info.at(2), updated_info.at(2), + /*max_number_of_equal_diffs=*/1); + for (size_t i = 2; i < updated_info.size(); i++) { + ASSERT_EQ(base_info.at(i).page_lengths, updated_info.at(i).page_lengths); + } +} + +TEST_F(TestColumnCDCMultipleRowGroups, Append) { + auto constexpr kRowGroupLength = 128 * 1024; + auto constexpr kEnableDictionary = false; + auto constexpr kMinChunkSize = 0 * 1024; + auto constexpr kMaxChunkSize = 128 * 1024; + + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto appended, + ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); + ASSERT_FALSE(base->Equals(*appended)); + ASSERT_EQ(appended->num_rows(), base->num_rows() + edit2_->num_rows()); + + ASSERT_OK_AND_ASSIGN(auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN(auto appended_info, + WriteAndGetParquetInfo(appended, kMinChunkSize, kMaxChunkSize, + kEnableDictionary, kRowGroupLength)); + + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(appended_info.size(), 7); + + for (size_t i = 0; i < appended_info.size() - 1; i++) { + ASSERT_EQ(base_info.at(i).page_lengths, 
appended_info.at(i).page_lengths); + } + // only the last row group should have more or equal number of pages + auto original_page_lengths = base_info.back().page_lengths; + auto appended_page_lengths = appended_info.back().page_lengths; + ASSERT_GE(original_page_lengths.size(), appended_page_lengths.size()); + for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { + ASSERT_EQ(original_page_lengths[i], appended_page_lengths[i]); + } + ASSERT_GT(appended_page_lengths.back(), original_page_lengths.back()); +} + } // namespace parquet // TODO: -// - test multiple row groups // - place information about the used CDC parameters to the metadata // - test the effect of the normalization factor // - do more validation on min/max chunk size +// - test extension types From 761392974ffd281c1854f9c4937cfee2bb6acda4 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 20 Mar 2025 10:19:49 +0100 Subject: [PATCH 057/102] Support extension types --- cpp/src/parquet/chunker_internal.cc | 3 +++ cpp/src/parquet/chunker_internal_test.cc | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 859fabb4eb5..2154dbe9710 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -291,6 +291,9 @@ class ContentDefinedChunker::Impl { const auto byte_width = array.byte_width(); return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { Roll(array.GetValue(i), byte_width); }); + } else if constexpr (ArrowType::type_id == ::arrow::Type::EXTENSION) { + const auto& array = static_cast(values); + return GetChunks(def_levels, rep_levels, num_levels, *array.storage()); } else if constexpr (::arrow::is_primitive(ArrowType::type_id)) { using c_type = typename ArrowType::c_type; return CalculateFixedWidth(def_levels, rep_levels, num_levels, diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc 
index 34431469e28..11872c83f36 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -24,6 +24,7 @@ #include #include "arrow/table.h" +#include "arrow/testing/extension_type.h" #include "arrow/type_fwd.h" #include "arrow/util/float16.h" #include "parquet/arrow/reader.h" @@ -234,6 +235,15 @@ Result> GenerateArray(const std::shared_ptr& field return list_array; } + case ::arrow::Type::EXTENSION: { + auto extension_type = dynamic_cast<::arrow::ExtensionType*>(type.get()); + auto storage_type = extension_type->storage_type(); + auto storage_field = ::arrow::field("storage", storage_type, true); + ARROW_ASSIGN_OR_RAISE(auto storage_array, + GenerateArray(storage_field, length, seed)); + return ::arrow::ExtensionType::WrapArray(type, storage_array); + } + default: return ::arrow::Status::NotImplemented("Unsupported data type " + type->ToString()); } @@ -1021,7 +1031,9 @@ INSTANTIATE_TEST_SUITE_P( CaseConfig{::arrow::list(::arrow::utf8()), true, 18}, CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, 8}, CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true, - 10})); + 10}, + // Extension type + CaseConfig{::arrow::uuid(), true, 16})); TEST(TestColumnCDC, WriteSingleColumnParquetFile) { // Define the schema with a single column "number" @@ -1219,4 +1231,3 @@ TEST_F(TestColumnCDCMultipleRowGroups, Append) { // - place information about the used CDC parameters to the metadata // - test the effect of the normalization factor // - do more validation on min/max chunk size -// - test extension types From 4e7dc0b63a56e760b1cc7103991e2e9c7086a792 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 20 Mar 2025 10:36:15 +0100 Subject: [PATCH 058/102] Test sliced tables --- cpp/src/parquet/chunker_internal_test.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 
11872c83f36..0880e18ee7d 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -1002,6 +1002,22 @@ TEST_P(TestColumnCDC, EmptyTable) { } } +TEST_P(TestColumnCDC, ArrayOffsets) { + ASSERT_OK_AND_ASSIGN(auto table, ConcatAndCombine({part1_, part2_, part3_})); + + for (auto offset : {0, 512, 1024}) { + auto sliced_table = table->Slice(offset); + + // assert that the first chunk of the first column has the expected offset + auto column = sliced_table->column(0); + auto first_chunk = column->chunk(0); + ASSERT_EQ(first_chunk->offset(), offset); + + // write out the sliced table, read it back and compare + ASSERT_OK(WriteAndGetParquetInfo(sliced_table, kMinChunkSize, kMaxChunkSize, true)); + } +} + INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestColumnCDC, testing::Values( From 804b00dcaa3b222166b5e02f366082aacd610a1f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 20 Mar 2025 10:42:07 +0100 Subject: [PATCH 059/102] Add comments about the hash value generation --- cpp/src/parquet/chunker_internal_codegen.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index f01e1a56d26..ad31e4b5185 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -80,7 +80,12 @@ def generate_hash(n: int, seed: int): - """Produce predictable hash values for a given seed and n using MD5.""" + """Produce predictable hash values for a given seed and n using MD5. + + The value can be arbitrary as long as it is deterministic and has a uniform + distribution. The MD5 hash is used to produce a 16 character hexadecimal + string which is then converted to a 64-bit integer. 
+ """ value = bytes([seed] * 64 + [n] * 64) hasher = hashlib.md5(value) return hasher.hexdigest()[:16] From 9735c4c90698f24ab656114d3473500c2914cc77 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 20 Mar 2025 11:36:45 +0100 Subject: [PATCH 060/102] Test that dictionary fallback is being triggered during testing --- cpp/src/parquet/chunker_internal_test.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 0880e18ee7d..f43bd928799 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -309,6 +309,7 @@ using ChunkList = std::vector; struct RowGroupInfo { ChunkList page_lengths; ChunkList page_sizes; + bool has_dictionary_page = false; }; using ParquetInfo = std::vector; @@ -332,6 +333,8 @@ ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, auto data_page = static_cast(page.get()); rg_info.page_sizes.push_back(data_page->size()); rg_info.page_lengths.push_back(data_page->num_values()); + } else if (page->type() == PageType::DICTIONARY_PAGE) { + rg_info.has_dictionary_page = true; } } result.push_back(rg_info); @@ -711,6 +714,10 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, const RowGroupInfo& base_info, const RowGroupInfo& modified_info, bool nullable, bool enable_dictionary, int64_t min_chunk_size, int64_t max_chunk_size) { + if (dtype->id() != ::arrow::Type::BOOL) { + ASSERT_EQ(base_info.has_dictionary_page, enable_dictionary); + ASSERT_EQ(modified_info.has_dictionary_page, enable_dictionary); + } if (::arrow::is_fixed_width(dtype->id()) && !nullable) { // for nullable types we cannot calculate the exact number of elements because // not all elements are fed through the chunker (null elements are skipped) From 9e2434a027b2255b2fefa940bb0e9cb76c825d18 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 10:12:27 +0100 Subject: [PATCH 061/102] Disabled unity 
build for mingw --- .github/workflows/ruby.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index b1ef66e921c..1d1d6917991 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -237,7 +237,8 @@ jobs: ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/ucrt${{ matrix.mingw-n-bits }} -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON - CMAKE_UNITY_BUILD: ON + # Disabled unity build because of https://github.com/apache/arrow/pull/45360#issuecomment-2739878882 + CMAKE_UNITY_BUILD: OFF steps: - name: Disable Crash Dialogs run: | From f4a28699bd4ad751a2bb4e03cd2eb50e3b5c4ff0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 12:35:29 +0100 Subject: [PATCH 062/102] Do more validation for the chunk size parameters and norm_factor --- cpp/src/parquet/chunker_internal.cc | 104 ++++++++++++++--------- cpp/src/parquet/chunker_internal.h | 20 ++--- cpp/src/parquet/chunker_internal_test.cc | 52 ++++++++++++ 3 files changed, 125 insertions(+), 51 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 2154dbe9710..16605e0e4ee 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -36,7 +36,7 @@ using ::arrow::internal::checked_cast; /// Calculate the mask to use for the rolling hash, the mask is used to determine if a /// new chunk should be created based on the rolling hash value. The mask is calculated -/// based on the min_size, max_size and norm_factor parameters. +/// based on the min_chunk_size, max_chunk_size and norm_factor parameters. /// /// Assuming that the gear hash hash random values with a uniform distribution, then each /// bit in the actual value of rolling_hash_ has even probability of being set so a mask @@ -45,7 +45,7 @@ using ::arrow::internal::checked_cast; /// The main drawback of this approach is the non-uniform distribution of the chunk sizes. 
/// /// Later on the FastCDC has improved the process by introducing: -/// - sub-minimum chunk cut-point skipping (not hashing the first `min_size` bytes) +/// - sub-minimum chunk cut-point skipping (not hashing the first `min_chunk_size` bytes) /// - chunk size normalization (using two masks) /// /// This implementation uses cut-point skipping because it improves the overall @@ -56,55 +56,77 @@ using ::arrow::internal::checked_cast; /// switching between the used hashtables. This approach is based on central limit theorem /// and approximates normal distribution of the chunk sizes. // -// @param min_size The minimum chunk size (default 256KiB) -// @param max_size The maximum chunk size (default 1MiB) +// @param min_chunk_size The minimum chunk size (default 256KiB) +// @param max_chunk_size The maximum chunk size (default 1MiB) // @param norm_factor Normalization factor (default 0) // @return The mask used to compare against the rolling hash -static uint64_t GetMask(int64_t min_size, int64_t max_size, uint8_t norm_factor) { +static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, + int8_t norm_factor) { + if (min_chunk_size < 0) { + throw ParquetException("min_chunk_size must be positive"); + } + if (max_chunk_size < min_chunk_size) { + throw ParquetException( + "max_chunk_size must be greater than or equal to min_chunk_size"); + } + // calculate the average size of the chunks - int64_t avg_size = (min_size + max_size) / 2; - // since we are skipping the first `min_size` bytes for each chunk, we need to + int64_t avg_chunk_size = (min_chunk_size + max_chunk_size) / 2; + // since we are skipping the first `min_chunk_size` bytes for each chunk, we need to // target a smaller chunk size to reach the average size after skipping the first - // `min_size` bytes - int64_t target_size = avg_size - min_size; + // `min_chunk_size` bytes + int64_t target_size = avg_chunk_size - min_chunk_size; // assuming that the gear hash has a uniform distribution, we can 
calculate the mask // by taking the floor(log2(target_size)) - size_t mask_bits = ::arrow::bit_util::NumRequiredBits(target_size) - 1; - // -3 because we are using 8 hash tables to have more gaussian-like distribution, + size_t mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); + + // 3 because we are using 8 hash tables to have more gaussian-like distribution, // a user defined `norm_factor` can be used to adjust the mask size, hence the matching // probability, by increasing the norm_factor we increase the probability of matching - // the mask, forcing the distribution closer to the average size - size_t effective_bits = mask_bits - 3 - norm_factor; - return std::numeric_limits::max() << (64 - effective_bits); + // the mask, forcing the distribution closer to the average size; norm_factor is 0 by + // default + if (norm_factor < -4 || norm_factor > 4) { + ARROW_LOG(WARNING) << "norm_factor=" << std::to_string(norm_factor) + << " is outside the recommended range (-4, 4)"; + } + size_t mask_bit_adjustment = norm_factor + 3; + + if (mask_bits - mask_bit_adjustment < 0) { + auto min_size_differece = 2 << mask_bit_adjustment; + + throw ParquetException( + "The difference between min_chunk_size=" + std::to_string(min_chunk_size) + + " and max_chunk_size=" + std::to_string(max_chunk_size) + + " is too small for the given " + "norm_factor=" + + std::to_string(norm_factor) + ", increase the difference to be at least " + + std::to_string(min_size_differece) + " bytes."); + } else if (mask_bits - mask_bit_adjustment > 64) { + throw ParquetException("The number of bits in the mask is too large, max 64 bits"); + } + mask_bits -= mask_bit_adjustment; + + // create the mask by setting the top mask_bits bits + return std::numeric_limits::max() << (64 - mask_bits); } class ContentDefinedChunker::Impl { public: - Impl(const LevelInfo& level_info, int64_t min_size, int64_t max_size, + Impl(const LevelInfo& level_info, int64_t min_chunk_size, int64_t 
max_chunk_size, int8_t norm_factor) : level_info_(level_info), - min_size_(min_size), - max_size_(max_size), - hash_mask_(GetMask(min_size, max_size, norm_factor)) { - if (min_size_ < 0) { - throw ParquetException("min_size must be non-negative"); - } - if (max_size_ < 0) { - throw ParquetException("max_size must be non-negative"); - } - if (min_size_ > max_size_) { - throw ParquetException("min_size must be less than or equal to max_size"); - } - } + min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + rolling_hash_mask_(GetMask(min_chunk_size, max_chunk_size, norm_factor)) {} void Roll(const bool value) { - if (chunk_size_++ < min_size_) { + if (chunk_size_++ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + has_matched_ = has_matched_ || ((rolling_hash_ & rolling_hash_mask_) == 0); } template @@ -113,14 +135,14 @@ class ContentDefinedChunker::Impl { // true if the hash matches the mask. chunk_size_ += ByteWidth; - if (chunk_size_ < min_size_) { + if (chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } for (size_t i = 0; i < ByteWidth; ++i) { rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + has_matched_ = has_matched_ || ((rolling_hash_ & rolling_hash_mask_) == 0); } } @@ -134,14 +156,14 @@ class ContentDefinedChunker::Impl { // hash matches the mask. 
chunk_size_ += length; - if (chunk_size_ < min_size_) { + if (chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } for (auto i = 0; i < length; ++i) { rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; - has_matched_ = has_matched_ || ((rolling_hash_ & hash_mask_) == 0); + has_matched_ = has_matched_ || ((rolling_hash_ & rolling_hash_mask_) == 0); } } @@ -161,7 +183,7 @@ class ContentDefinedChunker::Impl { return true; } } - if (ARROW_PREDICT_FALSE(chunk_size_ >= max_size_)) { + if (ARROW_PREDICT_FALSE(chunk_size_ >= max_chunk_size_)) { // we have a hard limit on the maximum chunk size, note that we don't reset the // rolling hash state here, so the next NeedNewChunk() call will continue from the // current state @@ -324,12 +346,12 @@ class ContentDefinedChunker::Impl { // Minimum chunk size in bytes, the rolling hash will not be updated until this size is // reached for each chunk. Note that all data sent through the hash function is counted // towards the chunk size, including definition and repetition levels. - const int64_t min_size_; - const int64_t max_size_; + const int64_t min_chunk_size_; + const int64_t max_chunk_size_; // The mask to match the rolling hash against to determine if a new chunk should be // created. The mask is calculated based on min/max chunk size and the normalization // factor. - const uint64_t hash_mask_; + const uint64_t rolling_hash_mask_; // Whether the rolling hash has matched the mask since the last chunk creation. 
This // flag is set true by the Roll() function when the mask is matched and reset to false @@ -345,9 +367,9 @@ class ContentDefinedChunker::Impl { }; ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, - int64_t min_size, int64_t max_size, - int8_t norm_factor) - : impl_(new Impl(level_info, min_size, max_size, norm_factor)) {} + int64_t min_chunk_size, + int64_t max_chunk_size, int8_t norm_factor) + : impl_(new Impl(level_info, min_chunk_size, max_chunk_size, norm_factor)) {} ContentDefinedChunker::~ContentDefinedChunker() = default; diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 0288be237ce..9a7f04abe5c 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -95,20 +95,20 @@ class ContentDefinedChunker { /// Create a new ContentDefinedChunker instance /// /// @param level_info Information about definition and repetition levels - /// @param min_size Minimum chunk size in bytes + /// @param min_chunk_size Minimum chunk size in bytes /// The rolling hash will not be updated until this size is reached for /// each chunk. Note that all data sent through the hash function is /// counted towards the chunk size, including definition and repetition /// levels if present. - /// @param max_size Maximum chunk size in bytes + /// @param max_chunk_size Maximum chunk size in bytes /// The chunker creates a new chunk whenever the chunk size exceeds this /// value. The chunk size distribution approximates a normal - /// distribution between min_size and max_size. Note that the parquet - /// writer has a related `data_pagesize` property that controls the - /// maximum size of a parquet data page after encoding. While setting - /// `data_pagesize` to a smaller value than `max_size` doesn't affect - /// the chunking effectiveness, it results in more small parquet data - /// pages. + /// distribution between min_chunk_size and max_chunk_size. 
Note that + /// the parquet writer has a related `data_pagesize` property that + /// controls the maximum size of a parquet data page after encoding. + /// While setting `data_pagesize` to a smaller value than + /// `max_chunk_size` doesn't affect the chunking effectiveness, it + /// results in more small parquet data pages. /// @param norm_factor Normalization factor to center the chunk size around the average /// size more aggressively, default 0. /// Increasing the normalization factor increases the probability of @@ -118,8 +118,8 @@ class ContentDefinedChunker { /// between deduplication ratio and fragmentation. Use norm_factor=1 /// or norm_factor=2 to reach a higher deduplication ratio at the /// expense of fragmentation. - ContentDefinedChunker(const LevelInfo& level_info, int64_t min_size, int64_t max_size, - int8_t norm_factor = 0); + ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, + int64_t max_chunk_size, int8_t norm_factor = 0); ~ContentDefinedChunker(); /// Get the chunk boundaries for the given column data diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index f43bd928799..5bd9153fa09 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -31,6 +31,7 @@ #include "parquet/arrow/reader_internal.h" #include "parquet/arrow/test_util.h" #include "parquet/arrow/writer.h" +#include "parquet/chunker_internal.h" #include "parquet/column_writer.h" #include "parquet/file_writer.h" @@ -1085,6 +1086,57 @@ TEST(TestColumnCDC, WriteSingleColumnParquetFile) { ParquetException); } +TEST(TestColumnCDC, ChunkSizeParameterValidation) { + // Test that constructor validates min/max chunk size parameters + auto li = internal::LevelInfo(); + + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); + + // with norm_factor=0 the difference between min and max chunk size must be + // at least 16 + 
ASSERT_THROW(internal::ContentDefinedChunker(li, 0, -1), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 1024, 512), ParquetException); + + ASSERT_THROW(internal::ContentDefinedChunker(li, -1, 0), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 0), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 16)); + ASSERT_THROW(internal::ContentDefinedChunker(li, -16, -16), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 16, 0), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 32, 32), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 32, 48)); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1024 * 1024, 2 * 1024 * 1024)); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1024 * 1024 * 1024L, + 2LL * 1024 * 1024 * 1024L)); + + // with norm_factor=1 the difference between min and max chunk size must be + // at least 64 + ASSERT_THROW(internal::ContentDefinedChunker(li, 1, -1, 1), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, -1, 1, 1), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 1, 1, 1), ParquetException); + ASSERT_THROW(internal::ContentDefinedChunker(li, 1, 32, 1), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1, 33, 1)); + + // with norm_factor=2 the difference between min and max chunk size must be + // at least 128 + ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 63, 2), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 64, 2)); + + // with norm_factor=-1 the difference between min and max chunk size must be + // at least 8 + ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 7, -1), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 8, -1)); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 16, -1)); + + // test the norm_factor extremes + 
ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 0, -68), ParquetException); + ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 0, -67)); + ASSERT_THROW( + internal::ContentDefinedChunker(li, 0, std::numeric_limits::max(), 59), + ParquetException); + ASSERT_NO_THROW( + internal::ContentDefinedChunker(li, 0, std::numeric_limits::max(), 58)); +} + class TestColumnCDCMultipleRowGroups : public ::testing::Test { protected: // Column random table parts for testing From 1d9cbc30e074693403fae8f2d6aef4c5fb3584ac Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 13:07:34 +0100 Subject: [PATCH 063/102] Reorder the validation to prevent UB in case of shifting with more than 64 bits --- cpp/src/parquet/chunker_internal.cc | 7 +++---- cpp/src/parquet/chunker_internal_test.cc | 5 ----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 16605e0e4ee..22ad3a5b767 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -91,9 +91,10 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, } size_t mask_bit_adjustment = norm_factor + 3; - if (mask_bits - mask_bit_adjustment < 0) { + if (mask_bits - mask_bit_adjustment > 64) { + throw ParquetException("The number of bits in the mask is too large, max 64 bits"); + } else if (mask_bits - mask_bit_adjustment < 0) { auto min_size_differece = 2 << mask_bit_adjustment; - throw ParquetException( "The difference between min_chunk_size=" + std::to_string(min_chunk_size) + " and max_chunk_size=" + std::to_string(max_chunk_size) + @@ -101,8 +102,6 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, "norm_factor=" + std::to_string(norm_factor) + ", increase the difference to be at least " + std::to_string(min_size_differece) + " bytes."); - } else if (mask_bits - mask_bit_adjustment > 64) { - throw ParquetException("The number of bits in the mask is 
too large, max 64 bits"); } mask_bits -= mask_bit_adjustment; diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 5bd9153fa09..1b575eda219 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -1301,8 +1301,3 @@ TEST_F(TestColumnCDCMultipleRowGroups, Append) { } } // namespace parquet - -// TODO: -// - place information about the used CDC parameters to the metadata -// - test the effect of the normalization factor -// - do more validation on min/max chunk size From 433d263098c8c2c02df465f41781b44d74489437 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 13:23:06 +0100 Subject: [PATCH 064/102] Mark ContentDefinedChunker as PARQUET_EXPORT to prevent link errors on windows --- cpp/src/parquet/chunker_internal.h | 38 +++++++++++++----------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 9a7f04abe5c..5a9a724e575 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -90,34 +90,30 @@ struct Chunk { /// https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf /// - Git is for Data (chunk size normalization used here is described in section 6.2.1): /// https://www.cidrdb.org/cidr2023/papers/p43-low.pdf -class ContentDefinedChunker { +class PARQUET_EXPORT ContentDefinedChunker { public: /// Create a new ContentDefinedChunker instance /// /// @param level_info Information about definition and repetition levels /// @param min_chunk_size Minimum chunk size in bytes - /// The rolling hash will not be updated until this size is reached for - /// each chunk. Note that all data sent through the hash function is - /// counted towards the chunk size, including definition and repetition - /// levels if present. + /// The rolling hash will not be updated until this size is reached for each chunk. 
+ /// Note that all data sent through the hash function is counted towards the chunk + /// size, including definition and repetition levels if present. /// @param max_chunk_size Maximum chunk size in bytes - /// The chunker creates a new chunk whenever the chunk size exceeds this - /// value. The chunk size distribution approximates a normal - /// distribution between min_chunk_size and max_chunk_size. Note that - /// the parquet writer has a related `data_pagesize` property that - /// controls the maximum size of a parquet data page after encoding. - /// While setting `data_pagesize` to a smaller value than - /// `max_chunk_size` doesn't affect the chunking effectiveness, it - /// results in more small parquet data pages. + /// The chunker creates a new chunk whenever the chunk size exceeds this value. The + /// chunk size distribution approximates a normal distribution between min_chunk_size + /// and max_chunk_size. Note that the parquet writer has a related `data_pagesize` + // property that controls the maximum size of a parquet data page after encoding. + /// While setting `data_pagesize` to a smaller value than `max_chunk_size` doesn't + /// affect the chunking effectiveness, it results in more small parquet data pages. /// @param norm_factor Normalization factor to center the chunk size around the average - /// size more aggressively, default 0. - /// Increasing the normalization factor increases the probability of - /// finding a chunk boundary, improving the deduplication ratio, but - /// also increases the number of small chunks resulting in many small - /// parquet data pages. The default value provides a good balance - /// between deduplication ratio and fragmentation. Use norm_factor=1 - /// or norm_factor=2 to reach a higher deduplication ratio at the - /// expense of fragmentation. + /// size more aggressively, default 0. 
+ /// Increasing the normalization factor increases the probability of finding a chunk + /// boundary, improving the deduplication ratio, but also increases the number of + /// small chunks resulting in many small parquet data pages. The default value + /// provides a good balance between deduplication ratio and fragmentation. + /// Use norm_factor=1 or norm_factor=2 to reach a higher deduplication ratio at the + /// expense of fragmentation. ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, int64_t max_chunk_size, int8_t norm_factor = 0); ~ContentDefinedChunker(); From 496e2e56f2ead60e622233a90b131ea1241d3b19 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 13:44:28 +0100 Subject: [PATCH 065/102] Add test for the pyarrow API and fix UB error --- cpp/src/parquet/chunker_internal.cc | 29 +++---- python/pyarrow/_parquet.pyx | 11 +++ .../tests/parquet/test_parquet_writer.py | 79 +++++++++++++++++++ 3 files changed, 103 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 22ad3a5b767..b4c0dabe4c8 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -66,8 +66,7 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, throw ParquetException("min_chunk_size must be positive"); } if (max_chunk_size < min_chunk_size) { - throw ParquetException( - "max_chunk_size must be greater than or equal to min_chunk_size"); + throw ParquetException("max_chunk_size must be greater than min_chunk_size"); } // calculate the average size of the chunks @@ -85,28 +84,26 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, // probability, by increasing the norm_factor we increase the probability of matching // the mask, forcing the distribution closer to the average size; norm_factor is 0 by // default - if (norm_factor < -4 || norm_factor > 4) { + if (norm_factor < -3 || norm_factor > 3) { 
ARROW_LOG(WARNING) << "norm_factor=" << std::to_string(norm_factor) - << " is outside the recommended range (-4, 4)"; + << " is outside the recommended range (-3, 3)"; } - size_t mask_bit_adjustment = norm_factor + 3; + size_t effective_bits = mask_bits - 3 - norm_factor; - if (mask_bits - mask_bit_adjustment > 64) { + if (effective_bits == 0) { + return 0; + } else if (effective_bits > 64) { throw ParquetException("The number of bits in the mask is too large, max 64 bits"); - } else if (mask_bits - mask_bit_adjustment < 0) { - auto min_size_differece = 2 << mask_bit_adjustment; + } else if (effective_bits < 0) { throw ParquetException( "The difference between min_chunk_size=" + std::to_string(min_chunk_size) + " and max_chunk_size=" + std::to_string(max_chunk_size) + - " is too small for the given " - "norm_factor=" + - std::to_string(norm_factor) + ", increase the difference to be at least " + - std::to_string(min_size_differece) + " bytes."); + " is too small for the given norm_factor=" + std::to_string(norm_factor) + + ", increase the difference."); + } else { + // create the mask by setting the top mask_bits bits + return std::numeric_limits::max() << (64 - effective_bits); } - mask_bits -= mask_bit_adjustment; - - // create the mask by setting the top mask_bits bits - return std::numeric_limits::max() << (64 - mask_bits); } class ContentDefinedChunker::Impl { diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 0516a999621..d7909dd44af 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2119,6 +2119,17 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( elif use_content_defined_chunking is False: props.disable_content_defined_chunking() elif isinstance(use_content_defined_chunking, dict): + defined_keys = use_content_defined_chunking.keys() + mandatory_keys = {"min_chunk_size", "max_chunk_size"} + allowed_keys = {"min_chunk_size", "max_chunk_size", "norm_factor"} + unknown_keys = defined_keys - 
allowed_keys + missing_keys = mandatory_keys - defined_keys + if unknown_keys: + raise ValueError( + f"Unknown options in 'use_content_defined_chunking': {unknown_keys}") + if missing_keys: + raise ValueError( + f"Missing options in 'use_content_defined_chunking': {missing_keys}") props.enable_content_defined_chunking() props.content_defined_chunking_options( use_content_defined_chunking["min_chunk_size"], diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index bc3714a6232..7bc14f3da2f 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -361,3 +361,82 @@ def test_parquet_writer_append_key_value_metadata(tempdir): assert metadata[b'key1'] == b'1' assert metadata[b'key2'] == b'2' assert metadata[b'key3'] == b'3' + + +def test_parquet_content_defined_chunking(tempdir): + table = pa.table({'a': range(100_000)}) + + pq.write_table(table, tempdir / 'unchunked.parquet') + pq.write_table(table, tempdir / 'chunked-default.parquet', + use_content_defined_chunking=True) + pq.write_table(table, tempdir / 'chunked-custom.parquet', + use_content_defined_chunking={"min_chunk_size": 32_768, + "max_chunk_size": 65_536}) + + # the data must be the same + unchunked = pq.read_table(tempdir / 'unchunked.parquet') + chunked_default = pq.read_table(tempdir / 'chunked-default.parquet') + chunked_custom = pq.read_table(tempdir / 'chunked-custom.parquet') + assert unchunked.equals(chunked_default) + assert unchunked.equals(chunked_custom) + + # number of row groups and their sizes are not affected by content defined chunking + unchunked_metadata = pq.read_metadata(tempdir / 'unchunked.parquet') + chunked_default_metadata = pq.read_metadata(tempdir / 'chunked-default.parquet') + chunked_custom_metadata = pq.read_metadata(tempdir / 'chunked-custom.parquet') + + assert unchunked_metadata.num_row_groups == chunked_default_metadata.num_row_groups + assert 
unchunked_metadata.num_row_groups == chunked_custom_metadata.num_row_groups + + for i in range(unchunked_metadata.num_row_groups): + rg_unchunked = unchunked_metadata.row_group(i) + rg_chunked_default = chunked_default_metadata.row_group(i) + rg_chunked_custom = chunked_custom_metadata.row_group(i) + assert rg_unchunked.num_rows == rg_chunked_default.num_rows + assert rg_unchunked.num_rows == rg_chunked_custom.num_rows + # since PageReader is not exposed we don't cannot inspect the page sizes + # so just check that the total byte size is different + assert rg_unchunked.total_byte_size != rg_chunked_default.total_byte_size + assert rg_unchunked.total_byte_size != rg_chunked_custom.total_byte_size + assert rg_chunked_default.total_byte_size != rg_chunked_custom.total_byte_size + + +def test_parquet_content_defined_chunking_parameters(tempdir): + table = pa.table({'a': range(100)}) + path = tempdir / 'chunked-invalid.parquet' + + # it raises OSError, not ideal but this is how parquet exceptions are handled + # currently + msg = "max_chunk_size must be greater than min_chunk_size" + with pytest.raises(Exception, match=msg): + cdc_options = {"min_chunk_size": 65_536, "max_chunk_size": 32_768} + pq.write_table(table, path, use_content_defined_chunking=cdc_options) + + cases = [ + ( + {"min_chunk_size": 64 * 1024, "unknown_option": True}, + "Unknown options in 'use_content_defined_chunking': {'unknown_option'}" + ), + ( + {"min_chunk_size": 64 * 1024}, + "Missing options in 'use_content_defined_chunking': {'max_chunk_size'}" + ), + ( + {"max_chunk_size": 64 * 1024}, + "Missing options in 'use_content_defined_chunking': {'min_chunk_size'}" + ) + ] + for cdc_options, msg in cases: + with pytest.raises(ValueError, match=msg): + pq.write_table(table, path, use_content_defined_chunking=cdc_options) + + # using the default parametrization + pq.write_table(table, path, use_content_defined_chunking=True) + + # using min_chunk_size and max_chunk_size + cdc_options = 
{"min_chunk_size": 32_768, "max_chunk_size": 65_536} + pq.write_table(table, path, use_content_defined_chunking=cdc_options) + + # using min_chunk_size, max_chunk_size and norm_factor + cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_factor": 1} + pq.write_table(table, path, use_content_defined_chunking=cdc_options) From c99e7cf22cc849ecf97133d624b0333127f55e36 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 22 Mar 2025 20:24:59 +0100 Subject: [PATCH 066/102] Test mask values calculated from the parameters --- cpp/src/parquet/chunker_internal.cc | 25 ++- cpp/src/parquet/chunker_internal.h | 6 + cpp/src/parquet/chunker_internal_test.cc | 232 +++++++++++++---------- 3 files changed, 158 insertions(+), 105 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index b4c0dabe4c8..95380f477c6 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -60,12 +60,12 @@ using ::arrow::internal::checked_cast; // @param max_chunk_size The maximum chunk size (default 1MiB) // @param norm_factor Normalization factor (default 0) // @return The mask used to compare against the rolling hash -static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, - int8_t norm_factor) { +static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, + int8_t norm_factor) { if (min_chunk_size < 0) { throw ParquetException("min_chunk_size must be positive"); } - if (max_chunk_size < min_chunk_size) { + if (max_chunk_size <= min_chunk_size) { throw ParquetException("max_chunk_size must be greater than min_chunk_size"); } @@ -75,9 +75,10 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, // target a smaller chunk size to reach the average size after skipping the first // `min_chunk_size` bytes int64_t target_size = avg_chunk_size - min_chunk_size; + // assuming that the gear hash has a uniform distribution, we can calculate the mask 
// by taking the floor(log2(target_size)) - size_t mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); + int mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); // 3 because we are using 8 hash tables to have more gaussian-like distribution, // a user defined `norm_factor` can be used to adjust the mask size, hence the matching @@ -86,20 +87,22 @@ static uint64_t GetMask(int64_t min_chunk_size, int64_t max_chunk_size, // default if (norm_factor < -3 || norm_factor > 3) { ARROW_LOG(WARNING) << "norm_factor=" << std::to_string(norm_factor) - << " is outside the recommended range (-3, 3)"; + << " is outside the recommended range [-3, 3]"; } - size_t effective_bits = mask_bits - 3 - norm_factor; + int mask_adjustement = 3 + norm_factor; + int effective_bits = mask_bits - mask_adjustement; if (effective_bits == 0) { return 0; } else if (effective_bits > 64) { - throw ParquetException("The number of bits in the mask is too large, max 64 bits"); + throw ParquetException("The number of bits in the mask cannot exceed 64, got " + + std::to_string(effective_bits)); } else if (effective_bits < 0) { throw ParquetException( "The difference between min_chunk_size=" + std::to_string(min_chunk_size) + " and max_chunk_size=" + std::to_string(max_chunk_size) + " is too small for the given norm_factor=" + std::to_string(norm_factor) + - ", increase the difference."); + ", either increase the size range or decrease the norm_factor."); } else { // create the mask by setting the top mask_bits bits return std::numeric_limits::max() << (64 - effective_bits); @@ -113,7 +116,9 @@ class ContentDefinedChunker::Impl { : level_info_(level_info), min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), - rolling_hash_mask_(GetMask(min_chunk_size, max_chunk_size, norm_factor)) {} + rolling_hash_mask_(CalculateMask(min_chunk_size, max_chunk_size, norm_factor)) {} + + uint64_t GetRollingHashMask() const { return rolling_hash_mask_; } void 
Roll(const bool value) { if (chunk_size_++ < min_chunk_size_) { @@ -376,4 +381,6 @@ std::vector ContentDefinedChunker::GetChunks(const int16_t* def_levels, return impl_->GetChunks(def_levels, rep_levels, num_levels, values); } +uint64_t ContentDefinedChunker::GetMask() const { return impl_->GetRollingHashMask(); } + } // namespace parquet::internal diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 5a9a724e575..9a6e7ff4193 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -26,6 +26,8 @@ namespace parquet::internal { +class TestPina; + // Represents a chunk of data with level offsets and value offsets due to the // record shredding for nested data. struct Chunk { @@ -129,8 +131,12 @@ class PARQUET_EXPORT ContentDefinedChunker { int64_t num_levels, const ::arrow::Array& values); private: + uint64_t GetMask() const; + class Impl; std::unique_ptr impl_; + + friend class TestCDC; }; } // namespace parquet::internal diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 1b575eda219..f6d82e283b9 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -35,7 +35,7 @@ #include "parquet/column_writer.h" #include "parquet/file_writer.h" -namespace parquet { +namespace parquet::internal { using ::arrow::Array; using ::arrow::ChunkedArray; @@ -760,7 +760,125 @@ void PrintTo(const CaseConfig& param, std::ostream* os) { *os << " }"; } -class TestColumnCDC : public ::testing::TestWithParam { +class TestCDC : public ::testing::Test { + public: + uint64_t GetMask(const ContentDefinedChunker& cdc) const { return cdc.GetMask(); } +}; + +TEST_F(TestCDC, RollingHashMaskCalculation) { + auto le = LevelInfo(); + auto min_size = 256 * 1024; + auto max_size = 1024 * 1024; + + auto cdc0 = ContentDefinedChunker(le, min_size, max_size, 0); + ASSERT_EQ(GetMask(cdc0), 0xFFFE000000000000); + + auto cdc1 = 
ContentDefinedChunker(le, min_size, max_size, 1); + ASSERT_EQ(GetMask(cdc1), 0xFFFC000000000000); + + auto cdc2 = ContentDefinedChunker(le, min_size, max_size, 2); + ASSERT_EQ(GetMask(cdc2), 0xFFF8000000000000); + + auto cdc3 = ContentDefinedChunker(le, min_size, max_size, 3); + ASSERT_EQ(GetMask(cdc3), 0xFFF0000000000000); + + auto cdc4 = ContentDefinedChunker(le, min_size, max_size, -1); + ASSERT_EQ(GetMask(cdc4), 0xFFFF000000000000); + + // this is the smallest possible mask always matching, by using 8 hashtables + // we are going to have a match every 8 bytes; this is an unrealistic case + // but checking for the correctness of the mask calculation + auto cdc5 = ContentDefinedChunker(le, 0, 16, 0); + ASSERT_EQ(GetMask(cdc5), 0x0000000000000000); + + auto cdc6 = ContentDefinedChunker(le, 0, 32, 1); + ASSERT_EQ(GetMask(cdc6), 0x0000000000000000); + + auto cdc7 = ContentDefinedChunker(le, 0, 16, -1); + ASSERT_EQ(GetMask(cdc7), 0x8000000000000000); + + // another unrealistic case, checking for the validation + auto cdc8 = ContentDefinedChunker(le, 128, 384, -60); + ASSERT_EQ(GetMask(cdc8), 0xFFFFFFFFFFFFFFFF); +} + +TEST_F(TestCDC, WriteSingleColumnParquetFile) { + // Define the schema with a single column "number" + auto schema = std::dynamic_pointer_cast(schema::GroupNode::Make( + "root", Repetition::REQUIRED, + {schema::PrimitiveNode::Make("number", Repetition::REQUIRED, Type::INT32)})); + + auto sink = CreateOutputStream(); + auto builder = WriterProperties::Builder(); + auto props = builder.enable_content_defined_chunking()->build(); + + auto writer = ParquetFileWriter::Open(sink, schema, props); + auto row_group_writer = writer->AppendRowGroup(); + + // Create a column writer for the "number" column + auto column_writer = row_group_writer->NextColumn(); + auto& int_column_writer = dynamic_cast(*column_writer); + + std::vector numbers = {1, 2, 3, 4, 5}; + std::vector valid_bits = {1, 0, 1, 0, 1}; + EXPECT_THROW( + int_column_writer.WriteBatch(numbers.size(), 
nullptr, nullptr, numbers.data()), + ParquetException); + EXPECT_THROW(int_column_writer.WriteBatchSpaced(numbers.size(), nullptr, nullptr, + valid_bits.data(), 0, numbers.data()), + ParquetException); +} + +TEST_F(TestCDC, ChunkSizeParameterValidation) { + // Test that constructor validates min/max chunk size parameters + auto li = LevelInfo(); + + ASSERT_NO_THROW(ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); + + // with norm_factor=0 the difference between min and max chunk size must be + // at least 16 + ASSERT_THROW(ContentDefinedChunker(li, 0, -1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1024, 512), ParquetException); + + ASSERT_THROW(ContentDefinedChunker(li, -1, 0), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 0, 0), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 16)); + ASSERT_THROW(ContentDefinedChunker(li, -16, -16), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 16, 0), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 32, 32), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 32, 48)); + ASSERT_NO_THROW(ContentDefinedChunker(li, 1024 * 1024, 2 * 1024 * 1024)); + ASSERT_NO_THROW( + ContentDefinedChunker(li, 1024 * 1024 * 1024L, 2LL * 1024 * 1024 * 1024L)); + + // with norm_factor=1 the difference between min and max chunk size must be + // at least 64 + ASSERT_THROW(ContentDefinedChunker(li, 1, -1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, -1, 1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1, 1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1, 32, 1), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 1, 33, 1)); + + // with norm_factor=2 the difference between min and max chunk size must be + // at least 128 + ASSERT_THROW(ContentDefinedChunker(li, 0, 63, 2), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 64, 2)); + + // with norm_factor=-1 the difference between min 
and max chunk size must be + // at least 8 + ASSERT_THROW(ContentDefinedChunker(li, 0, 7, -1), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 8, -1)); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 16, -1)); + + // test the norm_factor extremes + ASSERT_THROW(ContentDefinedChunker(li, 0, 0, -68), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 1, -67)); + ASSERT_THROW(ContentDefinedChunker(li, 0, std::numeric_limits::max(), 59), + ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, std::numeric_limits::max(), 58)); +} + +class TestCDCSingleRowGroup : public ::testing::TestWithParam { protected: // Column random table parts for testing std::shared_ptr field_; @@ -783,7 +901,7 @@ class TestColumnCDC : public ::testing::TestWithParam { } }; -TEST_P(TestColumnCDC, DeleteOnce) { +TEST_P(TestCDCSingleRowGroup, DeleteOnce) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); @@ -815,7 +933,7 @@ TEST_P(TestColumnCDC, DeleteOnce) { } } -TEST_P(TestColumnCDC, DeleteTwice) { +TEST_P(TestCDCSingleRowGroup, DeleteTwice) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, @@ -848,7 +966,7 @@ TEST_P(TestColumnCDC, DeleteTwice) { } } -TEST_P(TestColumnCDC, UpdateOnce) { +TEST_P(TestCDCSingleRowGroup, UpdateOnce) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); @@ -872,7 +990,7 @@ TEST_P(TestColumnCDC, UpdateOnce) { } } -TEST_P(TestColumnCDC, UpdateTwice) { +TEST_P(TestCDCSingleRowGroup, UpdateTwice) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, @@ -898,7 +1016,7 @@ TEST_P(TestColumnCDC, UpdateTwice) { } } -TEST_P(TestColumnCDC, InsertOnce) { +TEST_P(TestCDCSingleRowGroup, InsertOnce) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_})); @@ -929,7 +1047,7 @@ TEST_P(TestColumnCDC, InsertOnce) { } } -TEST_P(TestColumnCDC, InsertTwice) 
{ +TEST_P(TestCDCSingleRowGroup, InsertTwice) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part3_, part5_})); @@ -961,7 +1079,7 @@ TEST_P(TestColumnCDC, InsertTwice) { } } -TEST_P(TestColumnCDC, Append) { +TEST_P(TestCDCSingleRowGroup, Append) { const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); @@ -991,7 +1109,7 @@ TEST_P(TestColumnCDC, Append) { } } -TEST_P(TestColumnCDC, EmptyTable) { +TEST_P(TestCDCSingleRowGroup, EmptyTable) { const auto& param = GetParam(); auto schema = ::arrow::schema({::arrow::field("f0", param.dtype, param.is_nullable)}); @@ -1010,7 +1128,7 @@ TEST_P(TestColumnCDC, EmptyTable) { } } -TEST_P(TestColumnCDC, ArrayOffsets) { +TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { ASSERT_OK_AND_ASSIGN(auto table, ConcatAndCombine({part1_, part2_, part3_})); for (auto offset : {0, 512, 1024}) { @@ -1027,7 +1145,7 @@ TEST_P(TestColumnCDC, ArrayOffsets) { } INSTANTIATE_TEST_SUITE_P( - FixedSizedTypes, TestColumnCDC, + FixedSizedTypes, TestCDCSingleRowGroup, testing::Values( // Boolean CaseConfig{::arrow::boolean(), false, 1}, @@ -1059,85 +1177,7 @@ INSTANTIATE_TEST_SUITE_P( // Extension type CaseConfig{::arrow::uuid(), true, 16})); -TEST(TestColumnCDC, WriteSingleColumnParquetFile) { - // Define the schema with a single column "number" - auto schema = std::dynamic_pointer_cast(schema::GroupNode::Make( - "root", Repetition::REQUIRED, - {schema::PrimitiveNode::Make("number", Repetition::REQUIRED, Type::INT32)})); - - auto sink = CreateOutputStream(); - auto builder = WriterProperties::Builder(); - auto props = builder.enable_content_defined_chunking()->build(); - - auto writer = ParquetFileWriter::Open(sink, schema, props); - auto row_group_writer = writer->AppendRowGroup(); - - // Create a column writer for the "number" column - auto column_writer = row_group_writer->NextColumn(); - auto& int_column_writer = dynamic_cast(*column_writer); - - std::vector 
numbers = {1, 2, 3, 4, 5}; - std::vector valid_bits = {1, 0, 1, 0, 1}; - EXPECT_THROW( - int_column_writer.WriteBatch(numbers.size(), nullptr, nullptr, numbers.data()), - ParquetException); - EXPECT_THROW(int_column_writer.WriteBatchSpaced(numbers.size(), nullptr, nullptr, - valid_bits.data(), 0, numbers.data()), - ParquetException); -} - -TEST(TestColumnCDC, ChunkSizeParameterValidation) { - // Test that constructor validates min/max chunk size parameters - auto li = internal::LevelInfo(); - - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); - - // with norm_factor=0 the difference between min and max chunk size must be - // at least 16 - ASSERT_THROW(internal::ContentDefinedChunker(li, 0, -1), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 1024, 512), ParquetException); - - ASSERT_THROW(internal::ContentDefinedChunker(li, -1, 0), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 0), ParquetException); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 16)); - ASSERT_THROW(internal::ContentDefinedChunker(li, -16, -16), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 16, 0), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 32, 32), ParquetException); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 32, 48)); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1024 * 1024, 2 * 1024 * 1024)); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1024 * 1024 * 1024L, - 2LL * 1024 * 1024 * 1024L)); - - // with norm_factor=1 the difference between min and max chunk size must be - // at least 64 - ASSERT_THROW(internal::ContentDefinedChunker(li, 1, -1, 1), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, -1, 1, 1), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 1, 1, 1), ParquetException); - ASSERT_THROW(internal::ContentDefinedChunker(li, 1, 32, 1), ParquetException); - 
ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 1, 33, 1)); - - // with norm_factor=2 the difference between min and max chunk size must be - // at least 128 - ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 63, 2), ParquetException); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 64, 2)); - - // with norm_factor=-1 the difference between min and max chunk size must be - // at least 8 - ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 7, -1), ParquetException); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 8, -1)); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 16, -1)); - - // test the norm_factor extremes - ASSERT_THROW(internal::ContentDefinedChunker(li, 0, 0, -68), ParquetException); - ASSERT_NO_THROW(internal::ContentDefinedChunker(li, 0, 0, -67)); - ASSERT_THROW( - internal::ContentDefinedChunker(li, 0, std::numeric_limits::max(), 59), - ParquetException); - ASSERT_NO_THROW( - internal::ContentDefinedChunker(li, 0, std::numeric_limits::max(), 58)); -} - -class TestColumnCDCMultipleRowGroups : public ::testing::Test { +class TestCDCMultipleRowGroups : public ::testing::Test { protected: // Column random table parts for testing std::shared_ptr dtype_; @@ -1162,7 +1202,7 @@ class TestColumnCDCMultipleRowGroups : public ::testing::Test { } }; -TEST_F(TestColumnCDCMultipleRowGroups, InsertOnce) { +TEST_F(TestCDCMultipleRowGroups, InsertOnce) { auto constexpr kRowGroupLength = 128 * 1024; auto constexpr kEnableDictionary = false; auto constexpr kMinChunkSize = 0 * 1024; @@ -1198,7 +1238,7 @@ TEST_F(TestColumnCDCMultipleRowGroups, InsertOnce) { /*exact_number_of_smaller_diffs=*/0, edit2_->num_rows()); } -TEST_F(TestColumnCDCMultipleRowGroups, DeleteOnce) { +TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { auto constexpr kRowGroupLength = 128 * 1024; auto constexpr kEnableDictionary = false; auto constexpr kMinChunkSize = 0 * 1024; @@ -1234,7 +1274,7 @@ TEST_F(TestColumnCDCMultipleRowGroups, DeleteOnce) { 
/*exact_number_of_smaller_diffs=*/1, edit1_->num_rows()); } -TEST_F(TestColumnCDCMultipleRowGroups, UpdateOnce) { +TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { auto constexpr kRowGroupLength = 128 * 1024; auto constexpr kEnableDictionary = false; auto constexpr kMinChunkSize = 0 * 1024; @@ -1265,7 +1305,7 @@ TEST_F(TestColumnCDCMultipleRowGroups, UpdateOnce) { } } -TEST_F(TestColumnCDCMultipleRowGroups, Append) { +TEST_F(TestCDCMultipleRowGroups, Append) { auto constexpr kRowGroupLength = 128 * 1024; auto constexpr kEnableDictionary = false; auto constexpr kMinChunkSize = 0 * 1024; @@ -1300,4 +1340,4 @@ TEST_F(TestColumnCDCMultipleRowGroups, Append) { ASSERT_GT(appended_page_lengths.back(), original_page_lengths.back()); } -} // namespace parquet +} // namespace parquet::internal From 629d7c4c6c7e93c5af414af6f5dba1a1b7c60fa2 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 26 Mar 2025 13:27:40 +0100 Subject: [PATCH 067/102] Add ValidateChunks() sanity checks in debug builds --- cpp/src/parquet/chunker_internal.cc | 31 +++++++++++++++++++++++++++++ cpp/src/parquet/chunker_internal.h | 11 ++++------ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 95380f477c6..35c1ad5d75f 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -194,6 +194,33 @@ class ContentDefinedChunker::Impl { return false; } + void ValidateChunks(const std::vector& chunks, int64_t num_levels) { + // chunks must be non-empty and monotonic increasing + DCHECK(!chunks.empty()); + + // the first chunk must start at the first level + auto first_chunk = chunks.front(); + DCHECK_EQ(first_chunk.level_offset, 0); + DCHECK_EQ(first_chunk.value_offset, 0); + + // the following chunks must be contiguous, non-overlapping and monotonically + // increasing + auto sum_levels = first_chunk.levels_to_write; + for (size_t i = 1; i < chunks.size(); ++i) { + auto chunk = 
chunks[i]; + auto prev_chunk = chunks[i - 1]; + DCHECK_GT(chunk.levels_to_write, 0); + DCHECK_GE(chunk.value_offset, prev_chunk.value_offset); + DCHECK_EQ(chunk.level_offset, prev_chunk.level_offset + prev_chunk.levels_to_write); + sum_levels += chunk.levels_to_write; + } + DCHECK_EQ(sum_levels, num_levels); + + // the last chunk must end at the last level + auto last_chunk = chunks.back(); + DCHECK_EQ(last_chunk.level_offset + last_chunk.levels_to_write, num_levels); + } + template std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const RollFunc& RollValue) { @@ -269,6 +296,10 @@ class ContentDefinedChunker::Impl { if (prev_offset < num_levels) { chunks.push_back({prev_offset, prev_value_offset, num_levels - prev_offset}); } +#ifndef NDEBUG + ValidateChunks(chunks, num_levels); +#endif + return chunks; } diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 9a6e7ff4193..609b008ad74 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -17,17 +17,14 @@ #pragma once -#include -#include +#include #include + #include "arrow/array.h" #include "parquet/level_conversion.h" -#include "parquet/properties.h" namespace parquet::internal { -class TestPina; - // Represents a chunk of data with level offsets and value offsets due to the // record shredding for nested data. struct Chunk { @@ -51,14 +48,14 @@ struct Chunk { /// new-chunk chunk2 chunk3 /// /// The chunking process will adjust to maintain stable boundaries across data -/// modifications. Each chunk defines a new parquet data page which are contiguously +/// modifications. Each chunk defines a new parquet data page which is contiguously /// written out to the file. Since each page compressed independently, the files' contents /// would look like the following with unique page identifiers: /// /// File1: [Page1][Page2][Page3]... /// File2: [Page4][Page2][Page3]... 
/// -/// Then the parquet file is being uploaded to a content addressable storage system (CAS) +/// Then the parquet file is being uploaded to a content addressable storage (CAS) system /// which splits the bytes stream into content defined blobs. The CAS system will /// calculate a unique identifier for each blob, then store the blob in a key-value store. /// If the same blob is encountered again, the system can refer to the hash instead of From 4393e917c1cb69d2402b1847014fa2e2b479db71 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 26 Mar 2025 16:36:15 +0100 Subject: [PATCH 068/102] Simplify test assertion --- cpp/src/parquet/chunker_internal_test.cc | 33 +++++------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index f6d82e283b9..2d2c4c794bc 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -685,28 +685,11 @@ uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { return size / byte_width; } -void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max, - bool expect_dictionary_fallback = false) { - // expect the last chunk since it is not guaranteed to be within the range - if (expect_dictionary_fallback) { - // if dictionary encoding is enabled, the writer can fallback to plain - // encoding splitting within a content defined chunk, so we can't - // guarantee that all chunks are within the range in this case, but we - // know that there can be at most 2 pages smaller than the min_chunk_size - size_t smaller_count = 0; - for (size_t i = 0; i < chunks.size() - 1; i++) { - if (chunks[i] < min) { - smaller_count++; - } else { - ASSERT_LE(chunks[i], max); - } - } - ASSERT_LE(smaller_count, 2); - } else { - for (size_t i = 0; i < chunks.size() - 1; i++) { - ASSERT_GE(chunks[i], min); - ASSERT_LE(chunks[i], max); - } +void AssertAllBetween(const ChunkList& chunks, 
int64_t min, int64_t max) { + // except the last chunk since it is not guaranteed to be within the range + for (size_t i = 0; i < chunks.size() - 1; i++) { + ASSERT_GE(chunks[i], min); + ASSERT_LE(chunks[i], max); } ASSERT_LE(chunks.back(), max); } @@ -725,10 +708,8 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, auto byte_width = (dtype->id() == ::arrow::Type::BOOL) ? 1 : dtype->byte_width(); auto min_length = ElementCount(min_chunk_size, byte_width, nullable); auto max_length = ElementCount(max_chunk_size, byte_width, nullable); - AssertAllBetween(base_info.page_lengths, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(modified_info.page_lengths, min_length, max_length, - /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(base_info.page_lengths, min_length, max_length); + AssertAllBetween(modified_info.page_lengths, min_length, max_length); } else if (::arrow::is_base_binary_like(dtype->id()) && !nullable && !enable_dictionary) { AssertAllBetween(base_info.page_sizes, min_chunk_size, max_chunk_size); From 4d61fbe689df122379fb0d09cecdeef33344938a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 26 Mar 2025 19:42:22 +0100 Subject: [PATCH 069/102] Do not trigger an AddDataPage after the last chunk --- cpp/src/parquet/chunker_internal_test.cc | 49 ++++++++++++++++++++++-- cpp/src/parquet/column_writer.cc | 13 +++++-- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 2d2c4c794bc..72f2e1bdf47 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -25,6 +25,7 @@ #include "arrow/table.h" #include "arrow/testing/extension_type.h" +#include "arrow/testing/generator.h" #include "arrow/type_fwd.h" #include "arrow/util/float16.h" #include "parquet/arrow/reader.h" @@ -702,7 +703,7 @@ void AssertChunkSizes(const 
std::shared_ptr<::arrow::DataType>& dtype, ASSERT_EQ(base_info.has_dictionary_page, enable_dictionary); ASSERT_EQ(modified_info.has_dictionary_page, enable_dictionary); } - if (::arrow::is_fixed_width(dtype->id()) && !nullable) { + if (::arrow::is_fixed_width(dtype->id())) { // for nullable types we cannot calculate the exact number of elements because // not all elements are fed through the chunker (null elements are skipped) auto byte_width = (dtype->id() == ::arrow::Type::BOOL) ? 1 : dtype->byte_width(); @@ -710,8 +711,7 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, auto max_length = ElementCount(max_chunk_size, byte_width, nullable); AssertAllBetween(base_info.page_lengths, min_length, max_length); AssertAllBetween(modified_info.page_lengths, min_length, max_length); - } else if (::arrow::is_base_binary_like(dtype->id()) && !nullable && - !enable_dictionary) { + } else if (::arrow::is_base_binary_like(dtype->id()) && !enable_dictionary) { AssertAllBetween(base_info.page_sizes, min_chunk_size, max_chunk_size); AssertAllBetween(modified_info.page_sizes, min_chunk_size, max_chunk_size); } @@ -810,6 +810,49 @@ TEST_F(TestCDC, WriteSingleColumnParquetFile) { ParquetException); } +TEST_F(TestCDC, LastChunkDoesntTriggerAddDataPage) { + // Define the schema with a single column "number" + auto schema = std::dynamic_pointer_cast(schema::GroupNode::Make( + "root", Repetition::REQUIRED, + {schema::PrimitiveNode::Make("number", Repetition::REQUIRED, Type::INT32)})); + + auto sink = CreateOutputStream(); + auto builder = WriterProperties::Builder(); + auto props = builder.enable_content_defined_chunking() + ->content_defined_chunking_options(kMinChunkSize, kMaxChunkSize, 0) + ->disable_dictionary() + ->build(); + + auto writer = ParquetFileWriter::Open(sink, schema, props); + auto row_group_writer = writer->AppendRowGroup(); + + // Create a column writer for the "number" column + auto column_writer = row_group_writer->NextColumn(); + auto& 
int_column_writer = dynamic_cast(*column_writer); + + ASSERT_OK_AND_ASSIGN(auto array, ::arrow::gen::Step()->Generate(8000)); + auto arrow_props = default_arrow_writer_properties(); + auto arrow_ctx = ArrowWriteContext(default_memory_pool(), arrow_props.get()); + + // Calling WriteArrow twice, we expect that the first call doesn't add a new data page + // at the end allowing subsequent calls to append to the same page + ASSERT_OK(int_column_writer.WriteArrow(nullptr, nullptr, array->length(), *array, + &arrow_ctx, false)); + ASSERT_OK(int_column_writer.WriteArrow(nullptr, nullptr, array->length(), *array, + &arrow_ctx, false)); + + int_column_writer.Close(); + writer->Close(); + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + auto info = GetColumnParquetInfo(buffer); + ASSERT_EQ(info.size(), 1); + + // AssertAllBetween allow the last chunk size to be smaller than the min_chunk_size + AssertAllBetween(info[0].page_sizes, kMinChunkSize, kMaxChunkSize); + AssertAllBetween(info[0].page_lengths, 3000, 5000); +} + TEST_F(TestCDC, ChunkSizeParameterValidation) { // Test that constructor validates min/max chunk size parameters auto li = LevelInfo(); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index b29c01712b1..6e9d5a3a6f4 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1387,9 +1387,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } if (properties_->content_defined_chunking_enabled()) { - auto boundaries = content_defined_chunker_.GetChunks(def_levels, rep_levels, - num_levels, leaf_array); - for (auto chunk : boundaries) { + auto chunks = content_defined_chunker_.GetChunks(def_levels, rep_levels, num_levels, + leaf_array); + for (size_t i = 0; i < chunks.size(); i++) { + auto chunk = chunks[i]; auto chunk_array = leaf_array.Slice(chunk.value_offset); auto chunk_def_levels = AddIfNotNull(def_levels, chunk.level_offset); auto chunk_rep_levels = AddIfNotNull(rep_levels, 
chunk.level_offset); @@ -1402,11 +1403,15 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, chunk.levels_to_write, *chunk_array, ctx, maybe_parent_nulls)); } - if (num_buffered_values_ > 0) { + bool is_last_chunk = i == chunks.size() - 1; + if (num_buffered_values_ > 0 && !is_last_chunk) { // Explicitly add a new data page according to the content-defined chunk // boundaries. This way the same chunks will have the same byte-sequence // in the resulting file, which can be identified by content addressible // storage. + // Note that the last chunk doesn't trigger a new data page in order to + // allow subsequent WriteArrow() calls to continue writing to the same + // data page, the chunker's state is not being reset after the last chunk. AddDataPage(); } } From 5604ab6d3ae3f32c5fdef78efa01e5f8913b03cd Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 27 Mar 2025 14:33:05 +0100 Subject: [PATCH 070/102] Re-enable unity in the mingw build --- .github/workflows/ruby.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 1d1d6917991..2c0c103d560 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -237,8 +237,6 @@ jobs: ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/ucrt${{ matrix.mingw-n-bits }} -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON - # Disabled unity build because of https://github.com/apache/arrow/pull/45360#issuecomment-2739878882 - CMAKE_UNITY_BUILD: OFF steps: - name: Disable Crash Dialogs run: | From 724d9b3266e7036e30bdee9457ce068297f6c13b Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 27 Mar 2025 15:39:06 +0100 Subject: [PATCH 071/102] Remove unreachable branch --- cpp/src/parquet/chunker_internal.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 35c1ad5d75f..26dce9e6f35 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ 
b/cpp/src/parquet/chunker_internal.cc @@ -24,7 +24,6 @@ #include "arrow/array.h" #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" -#include "arrow/util/unreachable.h" #include "arrow/visit_type_inline.h" #include "parquet/chunker_internal_generated.h" #include "parquet/exception.h" @@ -331,10 +330,7 @@ class ContentDefinedChunker::Impl { int64_t num_levels, const ::arrow::Array& values) { auto handle_type = [&](auto&& type) -> std::vector { using ArrowType = std::decay_t; - if constexpr (std::is_same<::arrow::DataType, ArrowType>::value) { - // TODO(kszucs): this branch should be removed once #45816 is resolved - ::arrow::Unreachable("DataType is not a concrete type"); - } else if constexpr (ArrowType::type_id == ::arrow::Type::NA) { + if constexpr (ArrowType::type_id == ::arrow::Type::NA) { return Calculate(def_levels, rep_levels, num_levels, [](int64_t) {}); } else if constexpr (ArrowType::type_id == ::arrow::Type::BOOL) { const auto& array = static_cast(values); From b2fc28b344ec2986247f327904bae01a0113e370 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 27 Mar 2025 19:00:34 +0100 Subject: [PATCH 072/102] Use unity build again on MinGW and include window fixup in schema.cc --- .github/workflows/ruby.yml | 1 + cpp/src/parquet/chunker_internal.cc | 2 +- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/schema.cc | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 2c0c103d560..b1ef66e921c 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -237,6 +237,7 @@ jobs: ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/ucrt${{ matrix.mingw-n-bits }} -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON + CMAKE_UNITY_BUILD: ON steps: - name: Disable Crash Dialogs run: | diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 26dce9e6f35..e91f1cc0ae9 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ 
b/cpp/src/parquet/chunker_internal.cc @@ -17,7 +17,7 @@ #include "parquet/chunker_internal.h" -#include +#include #include #include diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 6e9d5a3a6f4..abb63ca48c9 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1403,7 +1403,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, chunk.levels_to_write, *chunk_array, ctx, maybe_parent_nulls)); } - bool is_last_chunk = i == chunks.size() - 1; + bool is_last_chunk = i == (chunks.size() - 1); if (num_buffered_values_ > 0 && !is_last_chunk) { // Explicitly add a new data page according to the content-defined chunk // boundaries. This way the same chunks will have the same byte-sequence diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 5ee18d730ab..0cfa49c21c1 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -28,7 +28,7 @@ #include "parquet/exception.h" #include "parquet/schema_internal.h" #include "parquet/thrift_internal.h" -#include "parquet/windows_fixup.h" +#include "parquet/windows_fixup.h" // for OPTIONAL using parquet::format::SchemaElement; From 8d6c8ec6be1dafc458b34e74c0cdee8e93ab123e Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 28 Mar 2025 13:18:00 +0100 Subject: [PATCH 073/102] Add docstring to WriterProperties; add documentation to the pyarrow parquet page; mark the feature experimental --- cpp/src/parquet/properties.h | 28 ++++++++++++++++++ docs/source/python/parquet.rst | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 79fe7198111..fdda79f4a31 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -293,16 +293,44 @@ class PARQUET_EXPORT WriterProperties { virtual ~Builder() {} + /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns. 
+ /// + /// Optimize parquet files for content addressable storage (CAS) systems by writing + /// data pages according to content-defined chunk boundaries. This allows for more + /// efficient deduplication of data across files, hence more efficient network + /// transfers and storage. The chunking is based on a rolling hash algorithm that + /// identifies chunk boundaries based on the actual content of the data. Builder* enable_content_defined_chunking() { content_defined_chunking_enabled_ = true; return this; } + /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns. Builder* disable_content_defined_chunking() { content_defined_chunking_enabled_ = false; return this; } + /// \brief EXPERIMENTAL: Specify content-defined chunking options. + /// + /// \param min_chunk_size Minimum chunk size in bytes, default 256 KiB + /// The rolling hash will not be updated until this size is reached for each chunk. + /// Note that all data sent through the hash function is counted towards the chunk + /// size, including definition and repetition levels if present. + /// \param max_chunk_size Maximum chunk size in bytes, default is 1024 KiB + /// The chunker will create a new chunk whenever the chunk size exceeds this value. + /// Note that the parquet writer has a related `pagesize` property that controls + /// the maximum size of a parquet data page after encoding. While setting + /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the + /// chunking effectiveness, it results in more small parquet data pages. + /// \param norm_factor Normalization factor to center the chunk size around the + /// average size more aggressively, default 0 + /// Increasing the normalization factor increases the probability of finding a chunk, + /// improving the deduplication ratio, but also increasing the number of small chunks + /// resulting in many small parquet data pages. 
The default value provides a good + /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or + /// norm_factor=2 to reach a higher deduplication ratio at the expense of + /// fragmentation. Builder* content_defined_chunking_options( int64_t min_chunk_size, int64_t max_chunk_size, int8_t norm_factor = kDefaultCdcOptions.norm_factor) { diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 73c23800db9..7183fa05d68 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -782,3 +782,57 @@ file decryption properties) is optional and it includes the following options: * ``cache_lifetime``, the lifetime of cached entities (key encryption keys, local wrapping keys, KMS client objects) represented as a ``datetime.timedelta``. + + +Content-Defined Chunking +------------------------ + +.. note:: + This feature is experimental and may change in future releases. + +PyArrow introduces an experimental feature for optimizing Parquet files for content +addressable storage (CAS) systems using content-defined chunking (CDC). This feature +enables efficient deduplication of data across files, improving network transfers and +storage efficiency. + +When enabled, data pages are written according to content-defined chunk boundaries, +determined by a rolling hash algorithm that identifies chunk boundaries based on the +actual content of the data. When data in a column is modified (e.g., inserted, deleted, +or updated), this approach minimizes the number of changed data pages. + +The feature can be enabled by setting the ``use_content_defined_chunking`` parameter in +the Parquet writer. 
It accepts either a boolean or a dictionary for configuration: + +- ``True``: Uses the default configuration with: + - Minimum chunk size: 256 KiB + - Maximum chunk size: 1024 KiB + - Normalization factor: 0 + +- ``dict``: Allows customization of the chunking parameters: + - ``min_chunk_size``: Minimum chunk size in bytes (default: 256 KiB). + - ``max_chunk_size``: Maximum chunk size in bytes (default: 1024 KiB). + - ``norm_factor``: Normalization factor to adjust chunk size distribution (default: 0). + +Note that the chunk size is calculated on the logical values before applying any encoding +or compression. The actual size of the data pages may vary based on the encoding and +compression used. + +.. code-block:: python + + import pyarrow as pa + import pyarrow.parquet as pq + + table = pa.Table.from_pandas(df) + + # Enable content-defined chunking with default settings + pq.write_table(table, 'example.parquet', use_content_defined_chunking=True) + + # Enable content-defined chunking with custom settings + pq.write_table( + table, + 'example_custom.parquet', + use_content_defined_chunking={ + 'min_chunk_size': 128 * 1024, # 128 KiB + 'max_chunk_size': 512 * 1024, # 512 KiB + } + ) From 899823b60956aad4be57ad296c3e6121f568707a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 28 Mar 2025 13:20:06 +0100 Subject: [PATCH 074/102] Mark the `use_content_defined_chunking` argument as experimental in the python docstrings --- python/pyarrow/parquet/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 9610a859da4..3585e3b9640 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -899,6 +899,8 @@ def _sanitize_table(table, new_schema, flavor): transfers and storage. The chunking is based on a rolling hash algorithm that identifies chunk boundaries based on the actual content of the data. 
+ Note that it is an experimental feature and the API may change in the future. + If set to ``True``, a default configuration is used with `min_chunk_size=256 KiB` and `max_chunk_size=1024 KiB`. The chunk size distribution approximates a normal distribution between `min_chunk_size` and `max_chunk_size` (sizes are accounted From 2b74c3773d659be66e589446724666b159bc1c88 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 28 Mar 2025 14:30:43 +0100 Subject: [PATCH 075/102] Add test configuration for ParquetDataPageVersion::V2 --- cpp/src/parquet/chunker_internal_test.cc | 186 +++++++++++++---------- 1 file changed, 108 insertions(+), 78 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 72f2e1bdf47..9d1fa38eb8d 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -165,6 +165,8 @@ Result> GenerateArray(const std::shared_ptr& field std::string("str_") + std::to_string(val)) GENERATE_CASE(BINARY, ::arrow::BinaryBuilder, std::string("bin_") + std::to_string(val)) + GENERATE_CASE(LARGE_BINARY, ::arrow::LargeBinaryBuilder, + std::string("bin_") + std::to_string(val)) case ::arrow::Type::FIXED_SIZE_BINARY: { auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); auto value_func = [size](uint64_t val) { @@ -269,16 +271,16 @@ Result> ConcatAndCombine( return table->CombineChunks(); } -Result> WriteTableToBuffer(const std::shared_ptr
& table, - int64_t min_chunk_size, - int64_t max_chunk_size, - bool enable_dictionary = false, - int64_t row_group_size = 1024 * 1024) { +Result> WriteTableToBuffer( + const std::shared_ptr
& table, int64_t min_chunk_size, int64_t max_chunk_size, + bool enable_dictionary = false, int64_t row_group_size = 1024 * 1024, + ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1) { auto sink = CreateOutputStream(); auto builder = WriterProperties::Builder(); builder.enable_content_defined_chunking()->content_defined_chunking_options( min_chunk_size, max_chunk_size, /*norm_factor=*/0); + builder.data_page_version(data_page_version); if (enable_dictionary) { builder.enable_dictionary(); } else { @@ -345,16 +347,18 @@ ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, return result; } -Result WriteAndGetParquetInfo(const std::shared_ptr
& table, - uint64_t min_chunk_size, - uint64_t max_chunk_size, - bool enable_dictionary = false, - int64_t row_group_size = 1024 * 1024, - int column_index = 0) { +Result WriteAndGetParquetInfo( + const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, + ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1, + int64_t row_group_size = 1024 * 1024, + + int column_index = 0) { // Write the table to a buffer and read it back to get the page sizes - ARROW_ASSIGN_OR_RAISE(auto buffer, - WriteTableToBuffer(table, min_chunk_size, max_chunk_size, - enable_dictionary, row_group_size)); + ARROW_ASSIGN_OR_RAISE( + auto buffer, + WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary, + row_group_size, data_page_version)); ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); RETURN_NOT_OK(readback->ValidateFull()); @@ -730,6 +734,8 @@ struct CaseConfig { // Approximate number of bytes per record to calculate the number of elements to // generate size_t bytes_per_record; + // Data page version to use + ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1; }; // Define PrintTo for MyStruct @@ -933,12 +939,13 @@ TEST_P(TestCDCSingleRowGroup, DeleteOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); @@ -966,12 +973,13 @@ TEST_P(TestCDCSingleRowGroup, DeleteTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, 
kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); @@ -998,12 +1006,13 @@ TEST_P(TestCDCSingleRowGroup, UpdateOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); @@ -1024,12 +1033,13 @@ TEST_P(TestCDCSingleRowGroup, UpdateTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, 
enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); @@ -1048,12 +1058,13 @@ TEST_P(TestCDCSingleRowGroup, InsertOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); @@ -1080,12 +1091,13 @@ TEST_P(TestCDCSingleRowGroup, InsertTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); @@ -1111,12 +1123,13 @@ TEST_P(TestCDCSingleRowGroup, Append) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( + base, kMinChunkSize, 
kMaxChunkSize, + enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, enable_dictionary)); - ASSERT_OK_AND_ASSIGN(auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + auto modified_info, + WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, + param.data_page_version)); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); @@ -1141,9 +1154,9 @@ TEST_P(TestCDCSingleRowGroup, EmptyTable) { ASSERT_EQ(empty_table->num_rows(), 0); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto result, - WriteAndGetParquetInfo(empty_table, kMinChunkSize, kMaxChunkSize, - enable_dictionary)); + ASSERT_OK_AND_ASSIGN( + auto result, WriteAndGetParquetInfo(empty_table, kMinChunkSize, kMaxChunkSize, + enable_dictionary, param.data_page_version)); // An empty table should result in no data pages ASSERT_EQ(result.size(), 1); @@ -1153,6 +1166,7 @@ TEST_P(TestCDCSingleRowGroup, EmptyTable) { } TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { + const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto table, ConcatAndCombine({part1_, part2_, part3_})); for (auto offset : {0, 512, 1024}) { @@ -1164,7 +1178,8 @@ TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { ASSERT_EQ(first_chunk->offset(), offset); // write out the sliced table, read it back and compare - ASSERT_OK(WriteAndGetParquetInfo(sliced_table, kMinChunkSize, kMaxChunkSize, true)); + ASSERT_OK(WriteAndGetParquetInfo(sliced_table, kMinChunkSize, kMaxChunkSize, true, + param.data_page_version)); } } @@ -1198,8 +1213,15 @@ INSTANTIATE_TEST_SUITE_P( CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, 8}, CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true, 10}, + CaseConfig{ + ::arrow::list(::arrow::struct_({::arrow::field("f0", 
::arrow::int32())})), + false, 16}, // Extension type - CaseConfig{::arrow::uuid(), true, 16})); + CaseConfig{::arrow::uuid(), true, 16}, + // Use ParquetDataPageVersion::V2 + CaseConfig{::arrow::large_binary(), false, 16, ParquetDataPageVersion::V2}, + CaseConfig{::arrow::list(::arrow::utf8()), true, 18, + ParquetDataPageVersion::V2})); class TestCDCMultipleRowGroups : public ::testing::Test { protected: @@ -1238,12 +1260,14 @@ TEST_F(TestCDCMultipleRowGroups, InsertOnce) { ASSERT_FALSE(base->Equals(*inserted)); ASSERT_EQ(inserted->num_rows(), base->num_rows() + edit2_->num_rows()); - ASSERT_OK_AND_ASSIGN(auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); - ASSERT_OK_AND_ASSIGN(auto inserted_info, - WriteAndGetParquetInfo(inserted, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto inserted_info, + WriteAndGetParquetInfo(inserted, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(inserted_info.size(), 7); @@ -1274,12 +1298,14 @@ TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { ASSERT_FALSE(base->Equals(*deleted)); ASSERT_EQ(deleted->num_rows(), base->num_rows() - edit1_->num_rows()); - ASSERT_OK_AND_ASSIGN(auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); - ASSERT_OK_AND_ASSIGN(auto deleted_info, - WriteAndGetParquetInfo(deleted, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto deleted_info, + 
WriteAndGetParquetInfo(deleted, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(deleted_info.size(), 7); @@ -1310,12 +1336,14 @@ TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { ConcatAndCombine({part1_, edit3_, part2_, part3_, edit2_})); ASSERT_FALSE(base->Equals(*updated)); - ASSERT_OK_AND_ASSIGN(auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); - ASSERT_OK_AND_ASSIGN(auto updated_info, - WriteAndGetParquetInfo(updated, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto updated_info, + WriteAndGetParquetInfo(updated, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(updated_info.size(), 7); @@ -1341,12 +1369,14 @@ TEST_F(TestCDCMultipleRowGroups, Append) { ASSERT_FALSE(base->Equals(*appended)); ASSERT_EQ(appended->num_rows(), base->num_rows() + edit2_->num_rows()); - ASSERT_OK_AND_ASSIGN(auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); - ASSERT_OK_AND_ASSIGN(auto appended_info, - WriteAndGetParquetInfo(appended, kMinChunkSize, kMaxChunkSize, - kEnableDictionary, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto base_info, + WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); + ASSERT_OK_AND_ASSIGN( + auto appended_info, + WriteAndGetParquetInfo(appended, kMinChunkSize, kMaxChunkSize, kEnableDictionary, + ParquetDataPageVersion::V1, kRowGroupLength)); ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(appended_info.size(), 7); From b9ef8187df7a5f7ed02e96f7f9ba19a55f7e046f 
Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 29 Mar 2025 10:15:04 +0100 Subject: [PATCH 076/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index e91f1cc0ae9..a9dfb4a1479 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -85,7 +85,7 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, // the mask, forcing the distribution closer to the average size; norm_factor is 0 by // default if (norm_factor < -3 || norm_factor > 3) { - ARROW_LOG(WARNING) << "norm_factor=" << std::to_string(norm_factor) + ARROW_LOG(WARNING) << "norm_factor=" << norm_factor << " is outside the recommended range [-3, 3]"; } @@ -120,7 +120,7 @@ class ContentDefinedChunker::Impl { uint64_t GetRollingHashMask() const { return rolling_hash_mask_; } void Roll(const bool value) { - if (chunk_size_++ < min_chunk_size_) { + if (++chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; From d49327e221faec9b508739ac9de46263742e47f2 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 2 Apr 2025 17:23:19 +0200 Subject: [PATCH 077/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 25 +++++++++++++------------ cpp/src/parquet/chunker_internal.h | 5 ++++- cpp/src/parquet/properties.h | 7 ++++--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index a9dfb4a1479..46d07d76ce2 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -18,6 +18,7 @@ #include "parquet/chunker_internal.h" #include +#include #include #include @@ -33,6 +34,11 @@ namespace parquet::internal { using 
::arrow::internal::checked_cast; +static_assert(std::size(kGearhashTable) == 8, + "should update CDC code to reflect number of generated hash tables"); +static_assert(sizeof(kGearhashTable) == 8 * 256 * 8, + "each table should have 256 entries of 64 bit values"); + /// Calculate the mask to use for the rolling hash, the mask is used to determine if a /// new chunk should be created based on the rolling hash value. The mask is calculated /// based on the min_chunk_size, max_chunk_size and norm_factor parameters. @@ -79,16 +85,11 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, // by taking the floor(log2(target_size)) int mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); - // 3 because we are using 8 hash tables to have more gaussian-like distribution, // a user defined `norm_factor` can be used to adjust the mask size, hence the matching // probability, by increasing the norm_factor we increase the probability of matching // the mask, forcing the distribution closer to the average size; norm_factor is 0 by // default - if (norm_factor < -3 || norm_factor > 3) { - ARROW_LOG(WARNING) << "norm_factor=" << norm_factor - << " is outside the recommended range [-3, 3]"; - } - + // adding 3 because we are using 8 hash tables to have more gaussian-like distribution int mask_adjustement = 3 + norm_factor; int effective_bits = mask_bits - mask_adjustement; if (effective_bits == 0) { @@ -103,7 +104,7 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, " is too small for the given norm_factor=" + std::to_string(norm_factor) + ", either increase the size range or decrease the norm_factor."); } else { - // create the mask by setting the top mask_bits bits + // create the mask by setting the top bits return std::numeric_limits::max() << (64 - effective_bits); } } @@ -111,7 +112,7 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, class ContentDefinedChunker::Impl 
{ public: Impl(const LevelInfo& level_info, int64_t min_chunk_size, int64_t max_chunk_size, - int8_t norm_factor) + int norm_factor) : level_info_(level_info), min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), @@ -129,18 +130,18 @@ class ContentDefinedChunker::Impl { has_matched_ = has_matched_ || ((rolling_hash_ & rolling_hash_mask_) == 0); } - template + template void Roll(const uint8_t* value) { // Update the rolling hash with a compile-time known sized value, set has_matched_ to // true if the hash matches the mask. - chunk_size_ += ByteWidth; + chunk_size_ += kByteWidth; if (chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated return; } - for (size_t i = 0; i < ByteWidth; ++i) { + for (size_t i = 0; i < kByteWidth; ++i) { rolling_hash_ = (rolling_hash_ << 1) + kGearhashTable[nth_run_][value[i]]; has_matched_ = has_matched_ || ((rolling_hash_ & rolling_hash_mask_) == 0); } @@ -396,7 +397,7 @@ class ContentDefinedChunker::Impl { ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, - int64_t max_chunk_size, int8_t norm_factor) + int64_t max_chunk_size, int norm_factor) : impl_(new Impl(level_info, min_chunk_size, max_chunk_size, norm_factor)) {} ContentDefinedChunker::~ContentDefinedChunker() = default; diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 609b008ad74..94e66ccb42b 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -28,8 +28,11 @@ namespace parquet::internal { // Represents a chunk of data with level offsets and value offsets due to the // record shredding for nested data. 
struct Chunk { + // The start offset of this chunk inside the given levels int64_t level_offset; + // The start offset of this chunk inside the given values array int64_t value_offset; + // The length of the chunk in levels int64_t levels_to_write; }; @@ -114,7 +117,7 @@ class PARQUET_EXPORT ContentDefinedChunker { /// Use norm_factor=1 or norm_factor=2 to reach a higher deduplication ratio at the /// expense of fragmentation. ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, - int64_t max_chunk_size, int8_t norm_factor = 0); + int64_t max_chunk_size, int norm_factor = 0); ~ContentDefinedChunker(); /// Get the chunk boundaries for the given column data diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index fdda79f4a31..e04086f2e10 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -323,14 +323,15 @@ class PARQUET_EXPORT WriterProperties { /// the maximum size of a parquet data page after encoding. While setting /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the /// chunking effectiveness, it results in more small parquet data pages. - /// \param norm_factor Normalization factor to center the chunk size around the - /// average size more aggressively, default 0 + /// \param norm_factor Number of bit adjustement to the gearhash mask in order to + /// center the chunk size around the average size more aggressively, default 0 /// Increasing the normalization factor increases the probability of finding a chunk, /// improving the deduplication ratio, but also increasing the number of small chunks /// resulting in many small parquet data pages. The default value provides a good /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or /// norm_factor=2 to reach a higher deduplication ratio at the expense of - /// fragmentation. + /// fragmentation. 
Negative values can also be used to reduce the probability of + /// finding a chunk, resulting in larger chunks and fewer data pages. Builder* content_defined_chunking_options( int64_t min_chunk_size, int64_t max_chunk_size, int8_t norm_factor = kDefaultCdcOptions.norm_factor) { From 7e0424605ded930175e8915f4143b4c5f9a330ac Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 2 Apr 2025 17:57:10 +0200 Subject: [PATCH 078/102] Use CDCOptions instead of arguments --- cpp/src/parquet/chunker_internal_codegen.py | 5 +- cpp/src/parquet/chunker_internal_test.cc | 4 +- cpp/src/parquet/properties.h | 57 ++++++++++----------- python/pyarrow/_parquet.pxd | 9 ++-- python/pyarrow/_parquet.pyx | 10 ++-- 5 files changed, 43 insertions(+), 42 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index ad31e4b5185..052842367ec 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -72,7 +72,7 @@ namespace parquet::internal {{ -constexpr uint64_t kGearhashTable[8][256] = {{ +constexpr uint64_t kGearhashTable[{ntables}][256] = {{ {content}}}; }} // namespace parquet::internal @@ -112,7 +112,8 @@ def generate_header(ntables=8, relative_path="chunker_internal_generated.h"): """Generate a header file with multiple gearhash tables.""" path = pathlib.Path(__file__).parent / relative_path tables = [generate_hashtable(seed) for seed in range(ntables)] - text = template.format(content=",\n".join(tables)) + content = ",\n".join(tables) + text = template.format(ntables=ntables, content=content) path.write_text(text) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 9d1fa38eb8d..0f572d880a5 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -279,7 +279,7 @@ Result> WriteTableToBuffer( auto builder = WriterProperties::Builder(); 
builder.enable_content_defined_chunking()->content_defined_chunking_options( - min_chunk_size, max_chunk_size, /*norm_factor=*/0); + {min_chunk_size, max_chunk_size, /*norm_factor=*/0}); builder.data_page_version(data_page_version); if (enable_dictionary) { builder.enable_dictionary(); @@ -825,7 +825,7 @@ TEST_F(TestCDC, LastChunkDoesntTriggerAddDataPage) { auto sink = CreateOutputStream(); auto builder = WriterProperties::Builder(); auto props = builder.enable_content_defined_chunking() - ->content_defined_chunking_options(kMinChunkSize, kMaxChunkSize, 0) + ->content_defined_chunking_options({kMinChunkSize, kMaxChunkSize, 0}) ->disable_dictionary() ->build(); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index e04086f2e10..b4cf24dd92f 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -245,13 +245,33 @@ class PARQUET_EXPORT ColumnProperties { bool page_index_enabled_; }; -struct CDCOptions { +// EXPERIMENTAL: Options for content-defined chunking. +struct PARQUET_EXPORT CDCOptions { + /// Minimum chunk size in bytes, default 256 KiB + /// The rolling hash will not be updated until this size is reached for each chunk. + /// Note that all data sent through the hash function is counted towards the chunk + /// size, including definition and repetition levels if present. int64_t min_chunk_size; + /// Maximum chunk size in bytes, default is 1024 KiB + /// The chunker will create a new chunk whenever the chunk size exceeds this value. + /// Note that the parquet writer has a related `pagesize` property that controls + /// the maximum size of a parquet data page after encoding. While setting + /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the + /// chunking effectiveness, it results in more small parquet data pages. 
int64_t max_chunk_size; - int8_t norm_factor; + /// Number of bits of adjustment to the gearhash mask in order to + /// center the chunk size around the average size more aggressively, default 0 + /// Increasing the normalization factor increases the probability of finding a chunk, + /// improving the deduplication ratio, but also increasing the number of small chunks + /// resulting in many small parquet data pages. The default value provides a good + /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or + /// norm_factor=2 to reach a higher deduplication ratio at the expense of + /// fragmentation. Negative values can also be used to reduce the probability of + /// finding a chunk, resulting in larger chunks and fewer data pages. + int norm_factor = 0; }; -static constexpr CDCOptions kDefaultCdcOptions = CDCOptions{256 * 1024, 1024 * 1024, 0}; +static constexpr CDCOptions kDefaultCDCOptions = CDCOptions{256 * 1024, 1024 * 1024, 0}; class PARQUET_EXPORT WriterProperties { public: @@ -270,7 +290,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), content_defined_chunking_enabled_(false), - content_defined_chunking_options_(kDefaultCdcOptions) {} + content_defined_chunking_options_(kDefaultCDCOptions) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -311,32 +331,9 @@ class PARQUET_EXPORT WriterProperties { return this; } - /// \brief EXPERIMENTAL: Specify content-defined chunking options. - /// - /// \param min_chunk_size Minimum chunk size in bytes, default 256 KiB - /// The rolling hash will not be updated until this size is reached for each chunk. - /// Note that all data sent through the hash function is counted towards the chunk - /// size, including definition and repetition levels if present. 
- /// \param max_chunk_size Maximum chunk size in bytes, default is 1024 KiB - /// The chunker will create a new chunk whenever the chunk size exceeds this value. - /// Note that the parquet writer has a related `pagesize` property that controls - /// the maximum size of a parquet data page after encoding. While setting - /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the - /// chunking effectiveness, it results in more small parquet data pages. - /// \param norm_factor Number of bit adjustement to the gearhash mask in order to - /// center the chunk size around the average size more aggressively, default 0 - /// Increasing the normalization factor increases the probability of finding a chunk, - /// improving the deduplication ratio, but also increasing the number of small chunks - /// resulting in many small parquet data pages. The default value provides a good - /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or - /// norm_factor=2 to reach a higher deduplication ratio at the expense of - /// fragmentation. Negative values can also be used to reduce the probability of - /// finding a chunk, resulting in larger chunks and fewer data pages. - Builder* content_defined_chunking_options( - int64_t min_chunk_size, int64_t max_chunk_size, - int8_t norm_factor = kDefaultCdcOptions.norm_factor) { - content_defined_chunking_options_ = - CDCOptions{min_chunk_size, max_chunk_size, norm_factor}; + /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CDCOptions. 
+ Builder* content_defined_chunking_options(const CDCOptions options) { + content_defined_chunking_options_ = options; return this; } diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index dec04eb77e0..4433b17253a 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -462,6 +462,11 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: + cdef cppclass CDCOptions: + int64_t min_chunk_size + int64_t max_chunk_size + int norm_factor + cdef cppclass WriterProperties: cppclass Builder: Builder* data_page_version(ParquetDataPageVersion version) @@ -497,9 +502,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_page_checksum() Builder* enable_content_defined_chunking() Builder* disable_content_defined_chunking() - Builder* content_defined_chunking_options(int64_t min_size, - int64_t max_size, - int8_t norm_factor) + Builder* content_defined_chunking_options(const CDCOptions options) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index d7909dd44af..9e79490e3e4 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1979,6 +1979,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( cdef: shared_ptr[WriterProperties] properties WriterProperties.Builder props + CDCOptions cdc_options # data_page_version @@ -2130,12 +2131,11 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( if missing_keys: raise ValueError( f"Missing options in 'use_content_defined_chunking': {missing_keys}") + cdc_options.min_chunk_size = use_content_defined_chunking["min_chunk_size"] + cdc_options.max_chunk_size = use_content_defined_chunking["max_chunk_size"] + cdc_options.norm_factor = use_content_defined_chunking.get("norm_factor", 0) props.enable_content_defined_chunking() - 
props.content_defined_chunking_options( - use_content_defined_chunking["min_chunk_size"], - use_content_defined_chunking["max_chunk_size"], - use_content_defined_chunking.get("norm_factor", 0) - ) + props.content_defined_chunking_options(cdc_options) else: raise TypeError( "'use_content_defined_chunking' should be either boolean or a dictionary") From 3ddd52932900c0ad1f92c11c25edc674a3c90a17 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 2 Apr 2025 18:14:42 +0200 Subject: [PATCH 079/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 2 +- cpp/src/parquet/chunker_internal_test.cc | 47 +++++++++++++----------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 46d07d76ce2..c8362c42aea 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -336,7 +336,7 @@ class ContentDefinedChunker::Impl { } else if constexpr (ArrowType::type_id == ::arrow::Type::BOOL) { const auto& array = static_cast(values); return Calculate(def_levels, rep_levels, num_levels, - [&](int64_t i) { return Roll(array.Value(i)); }); + [&](int64_t i) { Roll(array.Value(i)); }); } else if constexpr (ArrowType::type_id == ::arrow::Type::FIXED_SIZE_BINARY) { const auto& array = static_cast(values); const auto byte_width = array.byte_width(); diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 0f572d880a5..cbac1a0cc18 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include #include #include #include @@ -23,6 +22,8 @@ #include #include +#include + #include "arrow/table.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" @@ -47,6 +48,7 @@ using ::arrow::Field; using ::arrow::Result; using ::arrow::Schema; using ::arrow::Table; +using ::arrow::internal::checked_cast; using ::arrow::io::BufferReader; using ::parquet::arrow::FileReader; using ::parquet::arrow::FileReaderBuilder; @@ -126,7 +128,7 @@ Result> GenerateArray(const std::shared_ptr& field GENERATE_CASE(DOUBLE, ::arrow::DoubleBuilder, static_cast(val % 100000) / 1000.0) case ::arrow::Type::DECIMAL128: { - const auto& decimal_type = static_cast(*type); + const auto& decimal_type = checked_cast(*type); // Limit the value to fit within the specified precision int32_t max_exponent = decimal_type.precision() - decimal_type.scale(); int64_t max_value = static_cast(std::pow(10, max_exponent) - 1); @@ -137,7 +139,7 @@ Result> GenerateArray(const std::shared_ptr& field value_func); } case ::arrow::Type::DECIMAL256: { - const auto& decimal_type = static_cast(*type); + const auto& decimal_type = checked_cast(*type); // Limit the value to fit within the specified precision, capped at 9 to avoid // int64_t overflow int32_t max_exponent = std::min(9, decimal_type.precision() - decimal_type.scale()); @@ -168,7 +170,8 @@ Result> GenerateArray(const std::shared_ptr& field GENERATE_CASE(LARGE_BINARY, ::arrow::LargeBinaryBuilder, std::string("bin_") + std::to_string(val)) case ::arrow::Type::FIXED_SIZE_BINARY: { - auto size = static_cast<::arrow::FixedSizeBinaryType*>(type.get())->byte_width(); + auto size = + checked_cast(type.get())->byte_width(); auto value_func = [size](uint64_t val) { return std::string("bin_") + std::to_string(val).substr(0, size - 4); }; @@ -177,7 +180,7 @@ Result> GenerateArray(const std::shared_ptr& field } case ::arrow::Type::STRUCT: { - auto struct_type = static_cast<::arrow::StructType*>(type.get()); + auto struct_type = 
checked_cast(type.get()); std::vector> child_arrays; for (auto i = 0; i < struct_type->num_fields(); i++) { ARROW_ASSIGN_OR_RAISE(auto child_array, @@ -191,7 +194,7 @@ Result> GenerateArray(const std::shared_ptr& field } case ::arrow::Type::LIST: { - auto list_type = static_cast<::arrow::ListType*>(type.get()); + auto list_type = checked_cast(type.get()); auto value_field = ::arrow::field("item", list_type->value_type()); ARROW_ASSIGN_OR_RAISE(auto values_array, GenerateArray(value_field, length, seed)); auto offset_builder = ::arrow::Int32Builder(); @@ -199,7 +202,7 @@ Result> GenerateArray(const std::shared_ptr& field int32_t num_nulls = 0; int32_t num_elements = 0; - uint8_t element_size = 0; + int32_t element_size = 0; int32_t current_offset = 0; RETURN_NOT_OK(offset_builder.Append(current_offset)); while (current_offset < length) { @@ -240,7 +243,7 @@ Result> GenerateArray(const std::shared_ptr& field } case ::arrow::Type::EXTENSION: { - auto extension_type = dynamic_cast<::arrow::ExtensionType*>(type.get()); + auto extension_type = checked_cast(type.get()); auto storage_type = extension_type->storage_type(); auto storage_field = ::arrow::field("storage", storage_type, true); ARROW_ASSIGN_OR_RAISE(auto storage_array, @@ -471,7 +474,7 @@ std::vector FindDifferences(const ChunkList& first, const ChunkList& } void PrintDifferences(const ChunkList& original, const ChunkList& modified, - std::vector& diffs) { + const std::vector& diffs) { // Utility function to print the original and modified sequences, and the diffs // between them. Used in case of failing assertions to display the differences. 
std::cout << "Original: "; @@ -604,9 +607,9 @@ TEST(TestFindDifferences, AdditionalCase) { void AssertPageLengthDifferences(const RowGroupInfo& original, const RowGroupInfo& modified, - int8_t exact_number_of_equal_diffs, - int8_t exact_number_of_larger_diffs, - int8_t exact_number_of_smaller_diffs, + int32_t exact_number_of_equal_diffs, + int32_t exact_number_of_larger_diffs, + int32_t exact_number_of_smaller_diffs, int64_t edit_length = 0) { // Asserts that the differences between the original and modified page lengths // are as expected. A longest common subsequence diff is calculated on the original @@ -628,11 +631,11 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, } ASSERT_EQ(diffs.size(), expected_number_of_diffs); - uint8_t equal_diffs = 0; - int8_t larger_diffs = 0; - int8_t smaller_diffs = 0; + int32_t equal_diffs = 0; + int32_t larger_diffs = 0; + int32_t smaller_diffs = 0; for (const auto& diff : diffs) { - uint64_t original_sum = 0, modified_sum = 0; + int64_t original_sum = 0, modified_sum = 0; for (const auto& val : diff.first) original_sum += val; for (const auto& val : diff.second) modified_sum += val; @@ -656,17 +659,17 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, void AssertPageLengthDifferences(const RowGroupInfo& original, const RowGroupInfo& modified, - uint8_t max_number_of_equal_diffs) { + int32_t max_number_of_equal_diffs) { // A less restrictive version of the above assertion function mainly used to // assert the update case. 
auto diffs = FindDifferences(original.page_lengths, modified.page_lengths); - if (diffs.size() > max_number_of_equal_diffs) { + if (diffs.size() > static_cast(max_number_of_equal_diffs)) { PrintDifferences(original.page_lengths, modified.page_lengths, diffs); } - ASSERT_LE(diffs.size(), max_number_of_equal_diffs); + ASSERT_LE(diffs.size(), static_cast(max_number_of_equal_diffs)); for (const auto& diff : diffs) { - uint64_t left_sum = 0, right_sum = 0; + int64_t left_sum = 0, right_sum = 0; for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum, right_sum); @@ -680,12 +683,12 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, } } -uint64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { +int64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { if (nullable) { // in case of nullable types the def_levels are also fed through the chunker // to identify changes in the null bitmap, this will increase the byte width // and decrease the number of elements per chunk - byte_width += 2; + byte_width += sizeof(uint16_t); } return size / byte_width; } From e6ecef20af11405c2203a2fd824bdc94e14bcfcf Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 2 Apr 2025 18:48:16 +0200 Subject: [PATCH 080/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 6 +- cpp/src/parquet/chunker_internal.h | 4 +- cpp/src/parquet/chunker_internal_test.cc | 123 ++++++++++++----------- 3 files changed, 69 insertions(+), 64 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index c8362c42aea..61e673e5968 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -194,7 +194,7 @@ class ContentDefinedChunker::Impl { return false; } - void ValidateChunks(const std::vector& chunks, int64_t num_levels) { + void ValidateChunks(const std::vector& chunks, int64_t num_levels) const { // chunks 
must be non-empty and monotonic increasing DCHECK(!chunks.empty()); @@ -409,6 +409,8 @@ std::vector ContentDefinedChunker::GetChunks(const int16_t* def_levels, return impl_->GetChunks(def_levels, rep_levels, num_levels, values); } -uint64_t ContentDefinedChunker::GetMask() const { return impl_->GetRollingHashMask(); } +uint64_t ContentDefinedChunker::GetRollingHashMask() const { + return impl_->GetRollingHashMask(); +} } // namespace parquet::internal diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 94e66ccb42b..187f39647bd 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -131,7 +131,9 @@ class PARQUET_EXPORT ContentDefinedChunker { int64_t num_levels, const ::arrow::Array& values); private: - uint64_t GetMask() const; + /// @brief Get the rolling hash mask used to determine chunk boundaries, used for + /// testing the mask calculation. + uint64_t GetRollingHashMask() const; class Impl; std::unique_ptr impl_; diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index cbac1a0cc18..a359ff466f5 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -183,9 +183,8 @@ Result> GenerateArray(const std::shared_ptr& field auto struct_type = checked_cast(type.get()); std::vector> child_arrays; for (auto i = 0; i < struct_type->num_fields(); i++) { - ARROW_ASSIGN_OR_RAISE(auto child_array, - GenerateArray(struct_type->field(i), length, - seed + static_cast(i + 300))); + ARROW_ASSIGN_OR_RAISE(auto child_array, GenerateArray(struct_type->field(i), + length, seed + i * 10)); child_arrays.push_back(child_array); } auto struct_array = @@ -260,7 +259,7 @@ Result> GenerateTable( const std::shared_ptr<::arrow::Schema>& schema, int64_t size, int64_t seed = 0) { std::vector> arrays; for (const auto& field : schema->fields()) { - ARROW_ASSIGN_OR_RAISE(auto array, GenerateArray(field, size, seed)); + 
ARROW_ASSIGN_OR_RAISE(auto array, GenerateArray(field, size, ++seed)); arrays.push_back(array); } return Table::Make(schema, arrays, size); @@ -351,7 +350,7 @@ ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, } Result WriteAndGetParquetInfo( - const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + const std::shared_ptr
& table, int64_t min_chunk_size, int64_t max_chunk_size, bool enable_dictionary = false, ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1, int64_t row_group_size = 1024 * 1024, @@ -452,11 +451,8 @@ std::vector FindDifferences(const ChunkList& first, const ChunkList& if (!merged.empty()) { auto& prev = merged.back(); // Check if we can merge with the previous diff - bool can_merge_a = prev.first.empty() && !prev.second.empty() && - !diff.first.empty() && diff.second.empty(); - bool can_merge_b = prev.second.empty() && !prev.first.empty() && - !diff.second.empty() && diff.first.empty(); - + bool can_merge_a = prev.first.empty() && diff.second.empty(); + bool can_merge_b = prev.second.empty() && diff.first.empty(); if (can_merge_a) { // Combine into one diff: keep prev's second, use diff's first prev.first = std::move(diff.first); @@ -724,35 +720,13 @@ void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, } } -constexpr int64_t kMinChunkSize = 8 * 1024; -constexpr int64_t kMaxChunkSize = 32 * 1024; -constexpr int64_t kPartSize = 128 * 1024; -constexpr int64_t kEditSize = 128; - -struct CaseConfig { - // Arrow data type to generate the testing data for - std::shared_ptr<::arrow::DataType> dtype; - // Whether the data type is nullable - bool is_nullable; - // Approximate number of bytes per record to calculate the number of elements to - // generate - size_t bytes_per_record; - // Data page version to use - ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1; -}; - -// Define PrintTo for MyStruct -void PrintTo(const CaseConfig& param, std::ostream* os) { - *os << "{ " << param.dtype->ToString(); - if (param.is_nullable) { - *os << " nullable"; - } - *os << " }"; -} - class TestCDC : public ::testing::Test { - public: - uint64_t GetMask(const ContentDefinedChunker& cdc) const { return cdc.GetMask(); } + protected: + static constexpr int64_t kMinChunkSize = 8 * 1024; + static constexpr int64_t kMaxChunkSize = 32 * 
1024; + uint64_t GetRollingHashMask(const ContentDefinedChunker& cdc) const { + return cdc.GetRollingHashMask(); + } }; TEST_F(TestCDC, RollingHashMaskCalculation) { @@ -761,35 +735,35 @@ TEST_F(TestCDC, RollingHashMaskCalculation) { auto max_size = 1024 * 1024; auto cdc0 = ContentDefinedChunker(le, min_size, max_size, 0); - ASSERT_EQ(GetMask(cdc0), 0xFFFE000000000000); + ASSERT_EQ(GetRollingHashMask(cdc0), 0xFFFE000000000000); auto cdc1 = ContentDefinedChunker(le, min_size, max_size, 1); - ASSERT_EQ(GetMask(cdc1), 0xFFFC000000000000); + ASSERT_EQ(GetRollingHashMask(cdc1), 0xFFFC000000000000); auto cdc2 = ContentDefinedChunker(le, min_size, max_size, 2); - ASSERT_EQ(GetMask(cdc2), 0xFFF8000000000000); + ASSERT_EQ(GetRollingHashMask(cdc2), 0xFFF8000000000000); auto cdc3 = ContentDefinedChunker(le, min_size, max_size, 3); - ASSERT_EQ(GetMask(cdc3), 0xFFF0000000000000); + ASSERT_EQ(GetRollingHashMask(cdc3), 0xFFF0000000000000); auto cdc4 = ContentDefinedChunker(le, min_size, max_size, -1); - ASSERT_EQ(GetMask(cdc4), 0xFFFF000000000000); + ASSERT_EQ(GetRollingHashMask(cdc4), 0xFFFF000000000000); // this is the smallest possible mask always matching, by using 8 hashtables // we are going to have a match every 8 bytes; this is an unrealistic case // but checking for the correctness of the mask calculation auto cdc5 = ContentDefinedChunker(le, 0, 16, 0); - ASSERT_EQ(GetMask(cdc5), 0x0000000000000000); + ASSERT_EQ(GetRollingHashMask(cdc5), 0x0000000000000000); auto cdc6 = ContentDefinedChunker(le, 0, 32, 1); - ASSERT_EQ(GetMask(cdc6), 0x0000000000000000); + ASSERT_EQ(GetRollingHashMask(cdc6), 0x0000000000000000); auto cdc7 = ContentDefinedChunker(le, 0, 16, -1); - ASSERT_EQ(GetMask(cdc7), 0x8000000000000000); + ASSERT_EQ(GetRollingHashMask(cdc7), 0x8000000000000000); // another unrealistic case, checking for the validation auto cdc8 = ContentDefinedChunker(le, 128, 384, -60); - ASSERT_EQ(GetMask(cdc8), 0xFFFFFFFFFFFFFFFF); + ASSERT_EQ(GetRollingHashMask(cdc8), 
0xFFFFFFFFFFFFFFFF); } TEST_F(TestCDC, WriteSingleColumnParquetFile) { @@ -911,8 +885,33 @@ TEST_F(TestCDC, ChunkSizeParameterValidation) { ASSERT_NO_THROW(ContentDefinedChunker(li, 0, std::numeric_limits::max(), 58)); } +struct CaseConfig { + // Arrow data type to generate the testing data for + std::shared_ptr<::arrow::DataType> dtype; + // Whether the data type is nullable + bool is_nullable; + // Approximate number of bytes per record to calculate the number of elements to + // generate + int64_t bytes_per_record; + // Data page version to use + ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1; +}; + +// Define PrintTo for MyStruct +void PrintTo(const CaseConfig& param, std::ostream* os) { + *os << "{ " << param.dtype->ToString(); + if (param.is_nullable) { + *os << " nullable"; + } + *os << " }"; +} + class TestCDCSingleRowGroup : public ::testing::TestWithParam { protected: + static constexpr int64_t kPartSize = 128 * 1024; + static constexpr int64_t kEditSize = 128; + static constexpr int64_t kMinChunkSize = 8 * 1024; + static constexpr int64_t kMaxChunkSize = 32 * 1024; // Column random table parts for testing std::shared_ptr field_; std::shared_ptr
part1_, part2_, part3_, part4_, part5_, part6_, part7_; @@ -924,13 +923,15 @@ class TestCDCSingleRowGroup : public ::testing::TestWithParam { auto part_length = kPartSize / param.bytes_per_record; auto edit_length = kEditSize / param.bytes_per_record; - ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, part_length, 0)); - ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, edit_length, 1)); - ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, part_length, part_length)); - ASSERT_OK_AND_ASSIGN(part4_, GenerateTable(schema, edit_length, 2)); - ASSERT_OK_AND_ASSIGN(part5_, GenerateTable(schema, part_length, 2 * part_length)); - ASSERT_OK_AND_ASSIGN(part6_, GenerateTable(schema, edit_length, 3)); - ASSERT_OK_AND_ASSIGN(part7_, GenerateTable(schema, edit_length, 4)); + ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, part_length, /*seed=*/0)); + ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, edit_length, /*seed=*/1)); + ASSERT_OK_AND_ASSIGN(part3_, + GenerateTable(schema, part_length, /*seed=*/part_length)); + ASSERT_OK_AND_ASSIGN(part4_, GenerateTable(schema, edit_length, /*seed=*/2)); + ASSERT_OK_AND_ASSIGN(part5_, + GenerateTable(schema, part_length, /*seed=*/2 * part_length)); + ASSERT_OK_AND_ASSIGN(part6_, GenerateTable(schema, edit_length, /*seed=*/3)); + ASSERT_OK_AND_ASSIGN(part7_, GenerateTable(schema, edit_length, /*seed=*/4)); } }; @@ -1153,7 +1154,7 @@ TEST_P(TestCDCSingleRowGroup, EmptyTable) { const auto& param = GetParam(); auto schema = ::arrow::schema({::arrow::field("f0", param.dtype, param.is_nullable)}); - ASSERT_OK_AND_ASSIGN(auto empty_table, GenerateTable(schema, 0, 0)); + ASSERT_OK_AND_ASSIGN(auto empty_table, GenerateTable(schema, 0, /*seed=*/0)); ASSERT_EQ(empty_table->num_rows(), 0); for (bool enable_dictionary : {false, true}) { @@ -1241,13 +1242,13 @@ class TestCDCMultipleRowGroups : public ::testing::Test { auto field = ::arrow::field("f0", dtype_, true); auto schema = ::arrow::schema({field}); - ASSERT_OK_AND_ASSIGN(part1_, 
GenerateTable(schema, kPartLength, 0)); - ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, kPartLength, 2)); - ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, kPartLength, 4)); + ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, kPartLength, /*seed=*/0)); + ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, kPartLength, /*seed=*/2)); + ASSERT_OK_AND_ASSIGN(part3_, GenerateTable(schema, kPartLength, /*seed=*/4)); - ASSERT_OK_AND_ASSIGN(edit1_, GenerateTable(schema, kEditLength, 1)); - ASSERT_OK_AND_ASSIGN(edit2_, GenerateTable(schema, kEditLength, 3)); - ASSERT_OK_AND_ASSIGN(edit3_, GenerateTable(schema, kEditLength, 5)); + ASSERT_OK_AND_ASSIGN(edit1_, GenerateTable(schema, kEditLength, /*seed=*/1)); + ASSERT_OK_AND_ASSIGN(edit2_, GenerateTable(schema, kEditLength, /*seed=*/3)); + ASSERT_OK_AND_ASSIGN(edit3_, GenerateTable(schema, kEditLength, /*seed=*/5)); } }; From c6444c0b6b677b6ba2fdcf19d6ddd5c5fb85b060 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 2 Apr 2025 19:28:54 +0200 Subject: [PATCH 081/102] Use optional to store the cdc chunker in the column writer --- cpp/src/parquet/column_writer.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index abb63ca48c9..4c02758f20a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -753,11 +753,7 @@ class ColumnWriterImpl { closed_(false), fallback_(false), definition_levels_sink_(allocator_), - repetition_levels_sink_(allocator_), - content_defined_chunker_( - level_info_, properties->content_defined_chunking_options().min_chunk_size, - properties->content_defined_chunking_options().max_chunk_size, - properties->content_defined_chunking_options().norm_factor) { + repetition_levels_sink_(allocator_) { definition_levels_rle_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); repetition_levels_rle_ = @@ -769,6 +765,12 @@ class ColumnWriterImpl { 
compressor_temp_buffer_ = std::static_pointer_cast(AllocateBuffer(allocator_, 0)); } + if (properties_->content_defined_chunking_enabled()) { + auto cdc_options = properties_->content_defined_chunking_options(); + content_defined_chunker_.emplace(level_info_, cdc_options.min_chunk_size, + cdc_options.max_chunk_size, + cdc_options.norm_factor); + } } virtual ~ColumnWriterImpl() = default; @@ -900,7 +902,7 @@ class ColumnWriterImpl { std::vector> data_pages_; - internal::ContentDefinedChunker content_defined_chunker_; + std::optional content_defined_chunker_; private: void InitSinks() { @@ -1387,8 +1389,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, } if (properties_->content_defined_chunking_enabled()) { - auto chunks = content_defined_chunker_.GetChunks(def_levels, rep_levels, num_levels, - leaf_array); + DCHECK(content_defined_chunker_.has_value()); + auto chunks = content_defined_chunker_->GetChunks(def_levels, rep_levels, + num_levels, leaf_array); for (size_t i = 0; i < chunks.size(); i++) { auto chunk = chunks[i]; auto chunk_array = leaf_array.Slice(chunk.value_offset); From 52b7a405d0930752a3d7322f0bd99b2753d6c89b Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 3 Apr 2025 10:34:40 +0200 Subject: [PATCH 082/102] Improve test assertions; check null values for fixed size types --- cpp/src/parquet/chunker_internal.cc | 31 +- cpp/src/parquet/chunker_internal_codegen.py | 2 + cpp/src/parquet/chunker_internal_generated.h | 2 + cpp/src/parquet/chunker_internal_test.cc | 933 +++++++++++-------- 4 files changed, 570 insertions(+), 398 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 61e673e5968..610b7054dd0 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -34,9 +34,9 @@ namespace parquet::internal { using ::arrow::internal::checked_cast; -static_assert(std::size(kGearhashTable) == 8, +static_assert(std::size(kGearhashTable) == kNumGearhashTables, 
"should update CDC code to reflect number of generated hash tables"); -static_assert(sizeof(kGearhashTable) == 8 * 256 * 8, +static_assert(sizeof(kGearhashTable) == kNumGearhashTables * 256 * 8, "each table should have 256 entries of 64 bit values"); /// Calculate the mask to use for the rolling hash, the mask is used to determine if a @@ -66,7 +66,7 @@ static_assert(sizeof(kGearhashTable) == 8 * 256 * 8, // @param norm_factor Normalization factor (default 0) // @return The mask used to compare against the rolling hash static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, - int8_t norm_factor) { + int norm_factor) { if (min_chunk_size < 0) { throw ParquetException("min_chunk_size must be positive"); } @@ -78,8 +78,9 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int64_t avg_chunk_size = (min_chunk_size + max_chunk_size) / 2; // since we are skipping the first `min_chunk_size` bytes for each chunk, we need to // target a smaller chunk size to reach the average size after skipping the first - // `min_chunk_size` bytes - int64_t target_size = avg_chunk_size - min_chunk_size; + // `min_chunk_size` bytes; also divide by the number of gearhash tables to have a + // a more gaussian-like distribution + int64_t target_size = (avg_chunk_size - min_chunk_size) / kNumGearhashTables; // assuming that the gear hash has a uniform distribution, we can calculate the mask // by taking the floor(log2(target_size)) @@ -89,20 +90,12 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, // probability, by increasing the norm_factor we increase the probability of matching // the mask, forcing the distribution closer to the average size; norm_factor is 0 by // default - // adding 3 because we are using 8 hash tables to have more gaussian-like distribution - int mask_adjustement = 3 + norm_factor; - int effective_bits = mask_bits - mask_adjustement; - if (effective_bits == 0) { - return 0; - } else if 
(effective_bits > 64) { - throw ParquetException("The number of bits in the mask cannot exceed 64, got " + - std::to_string(effective_bits)); - } else if (effective_bits < 0) { + int effective_bits = mask_bits - norm_factor; + + if (effective_bits < 1 || effective_bits > 63) { throw ParquetException( - "The difference between min_chunk_size=" + std::to_string(min_chunk_size) + - " and max_chunk_size=" + std::to_string(max_chunk_size) + - " is too small for the given norm_factor=" + std::to_string(norm_factor) + - ", either increase the size range or decrease the norm_factor."); + "The number of bits in the CDC mask must be between 1 and 63, got " + + std::to_string(effective_bits)); } else { // create the mask by setting the top bits return std::numeric_limits::max() << (64 - effective_bits); @@ -178,7 +171,7 @@ class ContentDefinedChunker::Impl { // different gearhash table (gearhash's chunk size has geometric distribution, and // we use central limit theorem to approximate normal distribution, see // section 6.2.1 in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf) - if (ARROW_PREDICT_FALSE(++nth_run_ >= 7)) { + if (ARROW_PREDICT_FALSE(++nth_run_ >= kNumGearhashTables)) { nth_run_ = 0; chunk_size_ = 0; return true; diff --git a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index 052842367ec..5458d31dbfe 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -72,6 +72,8 @@ namespace parquet::internal {{ +constexpr int64_t kNumGearhashTables = {ntables}; + constexpr uint64_t kGearhashTable[{ntables}][256] = {{ {content}}}; diff --git a/cpp/src/parquet/chunker_internal_generated.h b/cpp/src/parquet/chunker_internal_generated.h index 13a47984b74..a09822a684e 100644 --- a/cpp/src/parquet/chunker_internal_generated.h +++ b/cpp/src/parquet/chunker_internal_generated.h @@ -20,6 +20,8 @@ namespace parquet::internal { +constexpr int64_t kNumGearhashTables = 8; + 
constexpr uint64_t kGearhashTable[8][256] = { {// seed = 0 0xf09f35a563783945, 0x0dcc5b3bc5ae410a, 0x63f1ea8d22554270, 0xfbe5ee7bd05a7b61, diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index a359ff466f5..435783eaec3 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -193,40 +193,46 @@ Result> GenerateArray(const std::shared_ptr& field } case ::arrow::Type::LIST: { + // Repeat the same pattern in the list array: + // null, empty list, list of 1 element, list of 3 elements + if (length % 4 != 0) { + return Status::Invalid( + "Length must be divisible by 4 when generating list arrays, but got: ", + length); + } + auto values_array_length = length * 4; auto list_type = checked_cast(type.get()); auto value_field = ::arrow::field("item", list_type->value_type()); - ARROW_ASSIGN_OR_RAISE(auto values_array, GenerateArray(value_field, length, seed)); + ARROW_ASSIGN_OR_RAISE(auto values_array, + GenerateArray(value_field, values_array_length, seed)); auto offset_builder = ::arrow::Int32Builder(); auto bitmap_builder = ::arrow::TypedBufferBuilder(); + RETURN_NOT_OK(offset_builder.Reserve(length + 1)); + RETURN_NOT_OK(bitmap_builder.Reserve(length)); + int32_t num_nulls = 0; - int32_t num_elements = 0; - int32_t element_size = 0; - int32_t current_offset = 0; - RETURN_NOT_OK(offset_builder.Append(current_offset)); - while (current_offset < length) { - num_elements++; - auto is_valid = !(nullable && (num_elements % 10 == 0)); - if (is_valid) { - RETURN_NOT_OK(bitmap_builder.Append(true)); - current_offset += element_size; - if (current_offset > length) { - RETURN_NOT_OK(offset_builder.Append(static_cast(length))); - break; - } else { - RETURN_NOT_OK(offset_builder.Append(current_offset)); - } - } else { - RETURN_NOT_OK(offset_builder.Append(static_cast(current_offset))); + RETURN_NOT_OK(offset_builder.Append(0)); + for (auto offset = 0; offset < length; offset += 4) { + if 
(nullable) { + // add a null RETURN_NOT_OK(bitmap_builder.Append(false)); - num_nulls++; - } - - if (element_size > 4) { - element_size = 0; + RETURN_NOT_OK(offset_builder.Append(offset)); + num_nulls += 1; } else { - element_size++; + // add an empty list + RETURN_NOT_OK(bitmap_builder.Append(true)); + RETURN_NOT_OK(offset_builder.Append(offset)); } + // add an empty list + RETURN_NOT_OK(bitmap_builder.Append(true)); + RETURN_NOT_OK(offset_builder.Append(offset)); + // add a list of 1 element + RETURN_NOT_OK(bitmap_builder.Append(true)); + RETURN_NOT_OK(offset_builder.Append(offset + 1)); + // add a list of 3 elements + RETURN_NOT_OK(bitmap_builder.Append(true)); + RETURN_NOT_OK(offset_builder.Append(offset + 4)); } std::shared_ptr offsets_array; @@ -255,6 +261,44 @@ Result> GenerateArray(const std::shared_ptr& field } } +TEST(TestGenerateArray, Integer) { + auto field = ::arrow::field("a", ::arrow::int32()); + ASSERT_OK_AND_ASSIGN(auto array, GenerateArray(field, /*length=*/10, /*seed=*/0)); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 10); + ASSERT_TRUE(array->type()->Equals(::arrow::int32())); + ASSERT_EQ(array->null_count(), 1); +} + +TEST(TestGenerateArray, ListOfInteger) { + auto field = ::arrow::field("a", ::arrow::list(::arrow::int32())); + auto length = 12; + ASSERT_OK_AND_ASSIGN(auto array, GenerateArray(field, length, /*seed=*/0)); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), length); + + for (size_t i = 0; i < 12; i += 4) { + // Assert the first element is null + ASSERT_TRUE(array->IsNull(i)); + + // Assert the second element is an empty list + ASSERT_TRUE(array->IsValid(i + 1)); + auto list_array = std::static_pointer_cast<::arrow::ListArray>(array); + ASSERT_EQ(list_array->value_length(i + 1), 0); + + // Assert the third element has length 1 + ASSERT_TRUE(array->IsValid(i + 2)); + ASSERT_EQ(list_array->value_length(i + 2), 1); + + // Assert the fourth element has length 3 + ASSERT_TRUE(array->IsValid(i + 3)); + 
ASSERT_EQ(list_array->value_length(i + 3), 3); + } + + ASSERT_NOT_OK(GenerateArray(field, 3, /*seed=*/0)); + ASSERT_OK(GenerateArray(field, 8, /*seed=*/0)); +} + Result> GenerateTable( const std::shared_ptr<::arrow::Schema>& schema, int64_t size, int64_t seed = 0) { std::vector> arrays; @@ -273,9 +317,23 @@ Result> ConcatAndCombine( return table->CombineChunks(); } +Result> ReadTableFromBuffer(const std::shared_ptr& data) { + std::shared_ptr
result; + FileReaderBuilder builder; + std::unique_ptr reader; + auto props = default_arrow_reader_properties(); + + RETURN_NOT_OK(builder.Open(std::make_shared(data))); + RETURN_NOT_OK(builder.memory_pool(::arrow::default_memory_pool()) + ->properties(props) + ->Build(&reader)); + RETURN_NOT_OK(reader->ReadTable(&result)); + return result; +} + Result> WriteTableToBuffer( const std::shared_ptr
& table, int64_t min_chunk_size, int64_t max_chunk_size, - bool enable_dictionary = false, int64_t row_group_size = 1024 * 1024, + int64_t row_group_length = 1024 * 1024, bool enable_dictionary = false, ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1) { auto sink = CreateOutputStream(); @@ -290,21 +348,27 @@ Result> WriteTableToBuffer( } auto write_props = builder.build(); auto arrow_props = ArrowWriterProperties::Builder().store_schema()->build(); - RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, + RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_length, write_props, arrow_props)); - return sink->Finish(); -} + ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish()); + + // check whether the schema has extension types, if not we can easily ensure that + // the parquet seralization is roundtripable with CDC enabled + bool validate_roundtrip = true; + for (const auto& field : table->schema()->fields()) { + if (field->type()->id() == ::arrow::Type::EXTENSION) { + validate_roundtrip = false; + break; + } + } + if (validate_roundtrip) { + ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); + RETURN_NOT_OK(readback->ValidateFull()); + ARROW_RETURN_IF(!readback->Equals(*table), + Status::Invalid("Readback table not equal to original")); + } -Result> ReadTableFromBuffer(const std::shared_ptr& data) { - std::shared_ptr
result; - FileReaderBuilder builder; - std::unique_ptr reader; - RETURN_NOT_OK(builder.Open(std::make_shared(data))); - RETURN_NOT_OK(builder.memory_pool(::arrow::default_memory_pool()) - ->properties(default_arrow_reader_properties()) - ->Build(&reader)); - RETURN_NOT_OK(reader->ReadTable(&result)); - return result; + return buffer; } // Type to represent a list of chunks where each element is the size of the chunk. @@ -312,16 +376,15 @@ using ChunkList = std::vector; // Type to represent the sizes and lengths of the data pages in a column. -struct RowGroupInfo { +struct ColumnInfo { ChunkList page_lengths; ChunkList page_sizes; bool has_dictionary_page = false; }; -using ParquetInfo = std::vector; +using ParquetInfo = std::vector; -ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, - int column_index = 0) { +ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, int column_index) { // Read the parquet data out of the buffer and get the sizes and lengths of the // data pages in given column. We assert on the sizes and lengths of the pages // to ensure that the chunking is done correctly. 
@@ -333,44 +396,22 @@ ParquetInfo GetColumnParquetInfo(const std::shared_ptr& data, auto metadata = parquet_reader->metadata(); for (int rg = 0; rg < metadata->num_row_groups(); rg++) { auto page_reader = parquet_reader->RowGroup(rg)->GetColumnPageReader(column_index); - RowGroupInfo rg_info; + ColumnInfo column_info; while (auto page = page_reader->NextPage()) { if (page->type() == PageType::DATA_PAGE || page->type() == PageType::DATA_PAGE_V2) { auto data_page = static_cast(page.get()); - rg_info.page_sizes.push_back(data_page->size()); - rg_info.page_lengths.push_back(data_page->num_values()); + column_info.page_sizes.push_back(data_page->uncompressed_size()); + column_info.page_lengths.push_back(data_page->num_values()); } else if (page->type() == PageType::DICTIONARY_PAGE) { - rg_info.has_dictionary_page = true; + column_info.has_dictionary_page = true; } } - result.push_back(rg_info); + result.push_back(column_info); } return result; } -Result WriteAndGetParquetInfo( - const std::shared_ptr
& table, int64_t min_chunk_size, int64_t max_chunk_size, - bool enable_dictionary = false, - ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1, - int64_t row_group_size = 1024 * 1024, - - int column_index = 0) { - // Write the table to a buffer and read it back to get the page sizes - ARROW_ASSIGN_OR_RAISE( - auto buffer, - WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary, - row_group_size, data_page_version)); - ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); - - RETURN_NOT_OK(readback->ValidateFull()); - if (readback->schema()->Equals(*table->schema())) { - ARROW_RETURN_IF(!readback->Equals(*table), - Status::Invalid("Readback table not equal to original")); - } - return GetColumnParquetInfo(buffer, column_index); -} - // A git-hunk like side-by-side data structure to represent the differences between two // vectors of uint64_t values. using ChunkDiff = std::pair; @@ -601,12 +642,11 @@ TEST(TestFindDifferences, AdditionalCase) { } } -void AssertPageLengthDifferences(const RowGroupInfo& original, - const RowGroupInfo& modified, +void AssertPageLengthDifferences(const ColumnInfo& original, const ColumnInfo& modified, int32_t exact_number_of_equal_diffs, int32_t exact_number_of_larger_diffs, int32_t exact_number_of_smaller_diffs, - int64_t edit_length = 0) { + const std::shared_ptr& edit_array) { // Asserts that the differences between the original and modified page lengths // are as expected. A longest common subsequence diff is calculated on the original // and modified sequences of page lengths. The exact_number_of_equal_diffs, @@ -615,6 +655,21 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, // lengths respectively. The edit_length parameter is used to verify that the page // lenght differences are exactly equal to the edit_length. 
auto diffs = FindDifferences(original.page_lengths, modified.page_lengths); + + int64_t edit_length = edit_array->length(); + if (::arrow::is_list_like(edit_array->type()->id())) { + // add null and empty lists to the edit length because the page length corresponds to + // the number of def/rep levels rather than the number of elements in the array + for (auto chunk : edit_array->chunks()) { + auto list_array = checked_cast(chunk.get()); + for (int i = 0; i < list_array->length(); i++) { + if (list_array->IsNull(i) || (list_array->value_length(i) == 0)) { + edit_length++; + } + } + } + } + size_t expected_number_of_diffs = exact_number_of_equal_diffs + exact_number_of_larger_diffs + exact_number_of_smaller_diffs; @@ -644,8 +699,6 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, smaller_diffs++; ASSERT_EQ(original_sum, modified_sum + edit_length); } - ASSERT_LE(diff.first.size(), 2); - ASSERT_LE(diff.second.size(), 2); } ASSERT_EQ(equal_diffs, exact_number_of_equal_diffs); @@ -653,8 +706,7 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, ASSERT_EQ(smaller_diffs, exact_number_of_smaller_diffs); } -void AssertPageLengthDifferences(const RowGroupInfo& original, - const RowGroupInfo& modified, +void AssertPageLengthDifferences(const ColumnInfo& original, const ColumnInfo& modified, int32_t max_number_of_equal_diffs) { // A less restrictive version of the above assertion function mainly used to // assert the update case. 
@@ -669,8 +721,6 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; ASSERT_EQ(left_sum, right_sum); - ASSERT_LE(diff.first.size(), 2); - ASSERT_LE(diff.second.size(), 2); } if (diffs.size() == 0) { @@ -679,56 +729,150 @@ void AssertPageLengthDifferences(const RowGroupInfo& original, } } -int64_t ElementCount(int64_t size, int32_t byte_width, bool nullable) { +Result CalculateCdcSize(const std::shared_ptr& array, bool nullable) { + // calculate the CDC chunk size based on the array elements belonging to a parquet page + auto type_id = array->type()->id(); + + int64_t result = 0; + if (::arrow::is_fixed_width(type_id)) { + int64_t element_size; + if (array->type()->id() == ::arrow::Type::BOOL) { + // the CDC chunker increments the chunk size by 1 for each boolean element + element_size = 1; + } else { + element_size = array->type()->byte_width(); + } + auto valid_elements = array->length() - array->null_count(); + result = valid_elements * element_size; + } else if (::arrow::is_binary_like(type_id)) { + auto binary_array = checked_cast(array.get()); + result += binary_array->total_values_length(); + } else if (::arrow::is_large_binary_like(type_id)) { + auto binary_array = checked_cast(array.get()); + result += binary_array->total_values_length(); + } else { + return Status::NotImplemented("CDC size calculation for type ", + array->type()->ToString(), " is not implemented"); + } + if (nullable) { - // in case of nullable types the def_levels are also fed through the chunker - // to identify changes in the null bitmap, this will increase the byte width - // and decrease the number of elements per chunk - byte_width += sizeof(uint16_t); + // in case of nullable types chunk size is calculated from def_levels and + // the valid values + return result + array->length() * sizeof(uint16_t); + } else { + // for non-nullable types the chunk size is 
calculated purely from the values + return result; } - return size / byte_width; } -void AssertAllBetween(const ChunkList& chunks, int64_t min, int64_t max) { - // except the last chunk since it is not guaranteed to be within the range - for (size_t i = 0; i < chunks.size() - 1; i++) { - ASSERT_GE(chunks[i], min); - ASSERT_LE(chunks[i], max); +Result CalculateCdcSize(const std::shared_ptr<::arrow::ChunkedArray>& array, + bool nullable) { + int64_t result = 0; + for (int i = 0; i < array->num_chunks(); i++) { + ARROW_ASSIGN_OR_RAISE(auto chunk_size, CalculateCdcSize(array->chunk(i), nullable)); + result += chunk_size; } - ASSERT_LE(chunks.back(), max); + return result; } -void AssertChunkSizes(const std::shared_ptr<::arrow::DataType>& dtype, - const RowGroupInfo& base_info, const RowGroupInfo& modified_info, - bool nullable, bool enable_dictionary, int64_t min_chunk_size, - int64_t max_chunk_size) { - if (dtype->id() != ::arrow::Type::BOOL) { - ASSERT_EQ(base_info.has_dictionary_page, enable_dictionary); - ASSERT_EQ(modified_info.has_dictionary_page, enable_dictionary); +void AssertContentDefinedChunkSizes(const std::shared_ptr<::arrow::ChunkedArray>& array, + const ColumnInfo& column_info, bool nullable, + int64_t min_chunk_size, int64_t max_chunk_size, + bool expect_dictionary_page) { + // check that the chunk sizes are within the expected range + + // the test tables are combined in the test cases so we expect only a single chunk + auto type_id = array->type()->id(); + + // check for the dictionary page if expected + if (type_id == ::arrow::Type::BOOL) { + ASSERT_FALSE(column_info.has_dictionary_page); + } else { + ASSERT_EQ(column_info.has_dictionary_page, expect_dictionary_page); + ASSERT_EQ(column_info.has_dictionary_page, expect_dictionary_page); } - if (::arrow::is_fixed_width(dtype->id())) { - // for nullable types we cannot calculate the exact number of elements because - // not all elements are fed through the chunker (null elements are skipped) - auto 
byte_width = (dtype->id() == ::arrow::Type::BOOL) ? 1 : dtype->byte_width(); - auto min_length = ElementCount(min_chunk_size, byte_width, nullable); - auto max_length = ElementCount(max_chunk_size, byte_width, nullable); - AssertAllBetween(base_info.page_lengths, min_length, max_length); - AssertAllBetween(modified_info.page_lengths, min_length, max_length); - } else if (::arrow::is_base_binary_like(dtype->id()) && !enable_dictionary) { - AssertAllBetween(base_info.page_sizes, min_chunk_size, max_chunk_size); - AssertAllBetween(modified_info.page_sizes, min_chunk_size, max_chunk_size); + + if (::arrow::is_fixed_width(type_id) || ::arrow::is_base_binary_like(type_id)) { + auto offset = 0; + + auto page_lengths = column_info.page_lengths; + for (size_t i = 0; i < page_lengths.size() - 1; i++) { + // since CDC chunking is applied on the logical values before any parquet encoding + // we first slice the array to get the logical array and then calculate the CDC + // chunk size based on that + auto page_length = page_lengths[i]; + auto array_chunk = array->Slice(offset, page_length); + offset += page_length; + + ASSERT_OK_AND_ASSIGN(auto cdc_chunk_size, CalculateCdcSize(array_chunk, nullable)); + ASSERT_GE(cdc_chunk_size, min_chunk_size); + ASSERT_LE(cdc_chunk_size, max_chunk_size); + } + + auto last_page_length = page_lengths.back(); + auto last_array_chunk = array->Slice(offset, last_page_length); + ASSERT_OK_AND_ASSIGN(auto last_cdc_chunk_size, + CalculateCdcSize(last_array_chunk, nullable)); + // min chunk size is not guaranteed for the last chunk, only check that it is not + // larger than the max chunk size + ASSERT_LE(last_cdc_chunk_size, max_chunk_size); + + // the sum of the page lengths should be equal to the length of the array + offset += last_page_length; + ASSERT_EQ(offset, array->length()); } + + // TODO(kszucs): have approximate size assertions for variable length types } class TestCDC : public ::testing::Test { protected: - static constexpr int64_t 
kMinChunkSize = 8 * 1024; - static constexpr int64_t kMaxChunkSize = 32 * 1024; + static constexpr int64_t kMinChunkSize = 4 * 1024; + static constexpr int64_t kMaxChunkSize = 16 * 1024; uint64_t GetRollingHashMask(const ContentDefinedChunker& cdc) const { return cdc.GetRollingHashMask(); } }; +TEST_F(TestCDC, ChunkSizeParameterValidation) { + // Test that constructor validates min/max chunk size parameters + auto li = LevelInfo(); + + ASSERT_NO_THROW(ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); + + // with norm_factor=0 the difference between min and max chunk size must be + // at least 16 + ASSERT_THROW(ContentDefinedChunker(li, 0, -1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1024, 512), ParquetException); + + ASSERT_THROW(ContentDefinedChunker(li, -1, 0), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 0, 0), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 0, 16), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 32)); + ASSERT_THROW(ContentDefinedChunker(li, -16, -16), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 16, 0), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 32, 32), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 32, 48), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 32, 64)); + ASSERT_NO_THROW(ContentDefinedChunker(li, 1024 * 1024, 2 * 1024 * 1024)); + ASSERT_NO_THROW( + ContentDefinedChunker(li, 1024 * 1024 * 1024L, 2LL * 1024 * 1024 * 1024L)); + + // with norm_factor=1 the difference between min and max chunk size must be + // at least 64 + ASSERT_THROW(ContentDefinedChunker(li, 1, -1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, -1, 1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1, 1, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1, 32, 1), ParquetException); + ASSERT_THROW(ContentDefinedChunker(li, 1, 33, 1), ParquetException); + 
ASSERT_NO_THROW(ContentDefinedChunker(li, 1, 65, 1)); + + // with norm_factor=2 the difference between min and max chunk size must be + // at least 128 + ASSERT_THROW(ContentDefinedChunker(li, 0, 123, 2), ParquetException); + ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 128, 2)); +} + TEST_F(TestCDC, RollingHashMaskCalculation) { auto le = LevelInfo(); auto min_size = 256 * 1024; @@ -749,21 +893,23 @@ TEST_F(TestCDC, RollingHashMaskCalculation) { auto cdc4 = ContentDefinedChunker(le, min_size, max_size, -1); ASSERT_EQ(GetRollingHashMask(cdc4), 0xFFFF000000000000); - // this is the smallest possible mask always matching, by using 8 hashtables - // we are going to have a match every 8 bytes; this is an unrealistic case - // but checking for the correctness of the mask calculation - auto cdc5 = ContentDefinedChunker(le, 0, 16, 0); - ASSERT_EQ(GetRollingHashMask(cdc5), 0x0000000000000000); + // check that mask bits are between 1 and 63 after adjusting with the 8 hash tables + ASSERT_THROW(ContentDefinedChunker(le, 0, 16, 0), ParquetException); + + auto cdc5 = ContentDefinedChunker(le, 0, 32, 0); + ASSERT_EQ(GetRollingHashMask(cdc5), 0x8000000000000000); + ASSERT_THROW(ContentDefinedChunker(le, 0, 32, 1), ParquetException); - auto cdc6 = ContentDefinedChunker(le, 0, 32, 1); - ASSERT_EQ(GetRollingHashMask(cdc6), 0x0000000000000000); + auto cdc6 = ContentDefinedChunker(le, 0, 64, 0); + ASSERT_EQ(GetRollingHashMask(cdc6), 0xC000000000000000); auto cdc7 = ContentDefinedChunker(le, 0, 16, -1); ASSERT_EQ(GetRollingHashMask(cdc7), 0x8000000000000000); // another unrealistic case, checking for the validation - auto cdc8 = ContentDefinedChunker(le, 128, 384, -60); - ASSERT_EQ(GetRollingHashMask(cdc8), 0xFFFFFFFFFFFFFFFF); + ASSERT_THROW(ContentDefinedChunker(le, 128, 384, -60), ParquetException); + auto cdc8 = ContentDefinedChunker(le, 128, 384, -59); + ASSERT_EQ(GetRollingHashMask(cdc8), 0xFFFFFFFFFFFFFFFE); } TEST_F(TestCDC, WriteSingleColumnParquetFile) { @@ -828,61 +974,13 
@@ TEST_F(TestCDC, LastChunkDoesntTriggerAddDataPage) { writer->Close(); ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); - auto info = GetColumnParquetInfo(buffer); + auto info = GetColumnParquetInfo(buffer, /*column_index=*/0); ASSERT_EQ(info.size(), 1); - // AssertAllBetween allow the last chunk size to be smaller than the min_chunk_size - AssertAllBetween(info[0].page_sizes, kMinChunkSize, kMaxChunkSize); - AssertAllBetween(info[0].page_lengths, 3000, 5000); -} - -TEST_F(TestCDC, ChunkSizeParameterValidation) { - // Test that constructor validates min/max chunk size parameters - auto li = LevelInfo(); - - ASSERT_NO_THROW(ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); - - // with norm_factor=0 the difference between min and max chunk size must be - // at least 16 - ASSERT_THROW(ContentDefinedChunker(li, 0, -1), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 1024, 512), ParquetException); - - ASSERT_THROW(ContentDefinedChunker(li, -1, 0), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 0, 0), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 16)); - ASSERT_THROW(ContentDefinedChunker(li, -16, -16), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 16, 0), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 32, 32), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 32, 48)); - ASSERT_NO_THROW(ContentDefinedChunker(li, 1024 * 1024, 2 * 1024 * 1024)); - ASSERT_NO_THROW( - ContentDefinedChunker(li, 1024 * 1024 * 1024L, 2LL * 1024 * 1024 * 1024L)); - - // with norm_factor=1 the difference between min and max chunk size must be - // at least 64 - ASSERT_THROW(ContentDefinedChunker(li, 1, -1, 1), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, -1, 1, 1), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 1, 1, 1), ParquetException); - ASSERT_THROW(ContentDefinedChunker(li, 1, 32, 1), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 1, 33, 1)); - - // with 
norm_factor=2 the difference between min and max chunk size must be - // at least 128 - ASSERT_THROW(ContentDefinedChunker(li, 0, 63, 2), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 64, 2)); - - // with norm_factor=-1 the difference between min and max chunk size must be - // at least 8 - ASSERT_THROW(ContentDefinedChunker(li, 0, 7, -1), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 8, -1)); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 16, -1)); - - // test the norm_factor extremes - ASSERT_THROW(ContentDefinedChunker(li, 0, 0, -68), ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 1, -67)); - ASSERT_THROW(ContentDefinedChunker(li, 0, std::numeric_limits::max(), 59), - ParquetException); - ASSERT_NO_THROW(ContentDefinedChunker(li, 0, std::numeric_limits::max(), 58)); + ASSERT_OK_AND_ASSIGN(auto chunked_array, ChunkedArray::Make({array, array})); + AssertContentDefinedChunkSizes(chunked_array, info.front(), /*nullable=*/false, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/false); } struct CaseConfig { @@ -890,9 +988,6 @@ struct CaseConfig { std::shared_ptr<::arrow::DataType> dtype; // Whether the data type is nullable bool is_nullable; - // Approximate number of bytes per record to calculate the number of elements to - // generate - int64_t bytes_per_record; // Data page version to use ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1; }; @@ -910,8 +1005,10 @@ class TestCDCSingleRowGroup : public ::testing::TestWithParam { protected: static constexpr int64_t kPartSize = 128 * 1024; static constexpr int64_t kEditSize = 128; - static constexpr int64_t kMinChunkSize = 8 * 1024; - static constexpr int64_t kMaxChunkSize = 32 * 1024; + static constexpr int64_t kMinChunkSize = 4 * 1024; + static constexpr int64_t kMaxChunkSize = 16 * 1024; + static constexpr int64_t kRowGroupLength = 1024 * 1024; + // Column random table parts for testing std::shared_ptr field_; std::shared_ptr
part1_, part2_, part3_, part4_, part5_, part6_, part7_; @@ -921,8 +1018,23 @@ class TestCDCSingleRowGroup : public ::testing::TestWithParam { auto field_ = ::arrow::field("f0", param.dtype, param.is_nullable); auto schema = ::arrow::schema({field_}); - auto part_length = kPartSize / param.bytes_per_record; - auto edit_length = kEditSize / param.bytes_per_record; + // since the chunk sizes are constant we derive the number of records to generate + // from the size of the data type unless it is nested or variable length where + // we use a hand picked value to avoid generating too large tables + int64_t bytes_per_record; + if (param.dtype->byte_width() > 0) { + bytes_per_record = param.dtype->byte_width(); + if (param.is_nullable) { + bytes_per_record += sizeof(uint16_t); + } + } else { + // for variable length types we use the size of the first element + bytes_per_record = 16; + } + + auto part_length = kPartSize / bytes_per_record; + auto edit_length = kEditSize / bytes_per_record; + ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, part_length, /*seed=*/0)); ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, edit_length, /*seed=*/1)); ASSERT_OK_AND_ASSIGN(part3_, @@ -943,28 +1055,33 @@ TEST_P(TestCDCSingleRowGroup, DeleteOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = 
GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); - auto edit_length = part2_->num_rows(); - if (::arrow::is_list_like(param.dtype->id())) { - edit_length += 1; - } + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, - /*exact_number_of_smaller_diffs=*/1, edit_length); + /*exact_number_of_smaller_diffs=*/1, part2_->column(0)); } } @@ -977,28 +1094,33 @@ TEST_P(TestCDCSingleRowGroup, DeleteTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = 
GetColumnParquetInfo(modified_parquet, /*column_index=*/0); // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); - auto edit_length = part2_->num_rows(); - if (::arrow::is_list_like(param.dtype->id())) { - edit_length += 1; - } + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, - /*exact_number_of_smaller_diffs=*/2, edit_length); + /*exact_number_of_smaller_diffs=*/2, part2_->column(0)); } } @@ -1010,18 +1132,29 @@ TEST_P(TestCDCSingleRowGroup, UpdateOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there is 
only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*max_number_of_equal_diffs=*/1); } @@ -1037,18 +1170,29 @@ TEST_P(TestCDCSingleRowGroup, UpdateTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, 
kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*max_number_of_equal_diffs=*/2); } @@ -1062,27 +1206,33 @@ TEST_P(TestCDCSingleRowGroup, InsertOnce) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); - auto edit_length = part2_->num_rows(); - if (::arrow::is_list_like(param.dtype->id())) { - edit_length += 1; - } + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + 
/*expect_dictionary_page=*/enable_dictionary); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/0, edit_length); + /*exact_number_of_smaller_diffs=*/0, part2_->column(0)); } } @@ -1095,27 +1245,33 @@ TEST_P(TestCDCSingleRowGroup, InsertTwice) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); - auto edit_length = part2_->num_rows(); - if (::arrow::is_list_like(param.dtype->id())) { - edit_length += 1; - } + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + 
AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/2, - /*exact_number_of_smaller_diffs=*/0, edit_length); + /*exact_number_of_smaller_diffs=*/0, part2_->column(0)); } } @@ -1127,26 +1283,38 @@ TEST_P(TestCDCSingleRowGroup, Append) { ASSERT_FALSE(base->Equals(*modified)); for (bool enable_dictionary : {false, true}) { - ASSERT_OK_AND_ASSIGN(auto base_info, WriteAndGetParquetInfo( - base, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); ASSERT_OK_AND_ASSIGN( - auto modified_info, - WriteAndGetParquetInfo(modified, kMinChunkSize, kMaxChunkSize, enable_dictionary, - param.data_page_version)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there is only one row group ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); - AssertChunkSizes(param.dtype, base_info.front(), modified_info.front(), - param.is_nullable, enable_dictionary, kMinChunkSize, kMaxChunkSize); + + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); auto original_page_lengths = base_info.front().page_lengths; auto modified_page_lengths = modified_info.front().page_lengths; - ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); + + 
ASSERT_LE(original_page_lengths.size(), modified_page_lengths.size()); for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); } - ASSERT_GT(modified_page_lengths.back(), original_page_lengths.back()); + auto last_index = original_page_lengths.size() - 1; + ASSERT_GE(modified_page_lengths[last_index], original_page_lengths[last_index]); } } @@ -1159,13 +1327,18 @@ TEST_P(TestCDCSingleRowGroup, EmptyTable) { for (bool enable_dictionary : {false, true}) { ASSERT_OK_AND_ASSIGN( - auto result, WriteAndGetParquetInfo(empty_table, kMinChunkSize, kMaxChunkSize, - enable_dictionary, param.data_page_version)); + auto parquet, + WriteTableToBuffer(empty_table, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto info = GetColumnParquetInfo(parquet, /*column_index=*/0); + + // There is a single row group + ASSERT_EQ(info.size(), 1); // An empty table should result in no data pages - ASSERT_EQ(result.size(), 1); - ASSERT_TRUE(result.front().page_lengths.empty()); - ASSERT_TRUE(result.front().page_sizes.empty()); + ASSERT_TRUE(info.front().page_lengths.empty()); + ASSERT_TRUE(info.front().page_sizes.empty()); } } @@ -1182,8 +1355,13 @@ TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { ASSERT_EQ(first_chunk->offset(), offset); // write out the sliced table, read it back and compare - ASSERT_OK(WriteAndGetParquetInfo(sliced_table, kMinChunkSize, kMaxChunkSize, true, - param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto sliced_parquet, + WriteTableToBuffer(sliced_table, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + true, param.data_page_version)); + + auto sliced_info = GetColumnParquetInfo(sliced_parquet, /*column_index=*/0); + ASSERT_EQ(sliced_info.size(), 1); } } @@ -1191,53 +1369,55 @@ INSTANTIATE_TEST_SUITE_P( FixedSizedTypes, TestCDCSingleRowGroup, testing::Values( // Boolean - CaseConfig{::arrow::boolean(), false, 1}, + 
CaseConfig{::arrow::boolean(), false}, // Numeric - CaseConfig{::arrow::uint8(), false, 1}, CaseConfig{::arrow::uint16(), false, 2}, - CaseConfig{::arrow::uint32(), false, 4}, CaseConfig{::arrow::uint64(), true, 8}, - CaseConfig{::arrow::int8(), false, 1}, CaseConfig{::arrow::int16(), false, 2}, - CaseConfig{::arrow::int32(), false, 4}, CaseConfig{::arrow::int64(), true, 8}, - CaseConfig{::arrow::float16(), false, 2}, - CaseConfig{::arrow::float32(), false, 4}, CaseConfig{::arrow::float64(), true, 8}, - CaseConfig{::arrow::decimal128(18, 6), false, 16}, - CaseConfig{::arrow::decimal256(40, 6), false, 32}, + CaseConfig{::arrow::uint8(), false}, CaseConfig{::arrow::uint16(), false}, + CaseConfig{::arrow::uint32(), false}, CaseConfig{::arrow::uint64(), true}, + CaseConfig{::arrow::int8(), false}, CaseConfig{::arrow::int16(), false}, + CaseConfig{::arrow::int32(), false}, CaseConfig{::arrow::int64(), true}, + CaseConfig{::arrow::float16(), false}, CaseConfig{::arrow::float32(), false}, + CaseConfig{::arrow::float64(), true}, + CaseConfig{::arrow::decimal128(18, 6), false}, + CaseConfig{::arrow::decimal256(40, 6), false}, // Binary-like - CaseConfig{::arrow::utf8(), false, 16}, CaseConfig{::arrow::binary(), true, 16}, - CaseConfig{::arrow::fixed_size_binary(16), true, 16}, + CaseConfig{::arrow::utf8(), false}, CaseConfig{::arrow::binary(), true}, + CaseConfig{::arrow::fixed_size_binary(16), true}, // Temporal - CaseConfig{::arrow::date32(), false, 4}, - CaseConfig{::arrow::time32(::arrow::TimeUnit::MILLI), true, 4}, - CaseConfig{::arrow::time64(::arrow::TimeUnit::NANO), false, 8}, - CaseConfig{::arrow::timestamp(::arrow::TimeUnit::NANO), true, 8}, - CaseConfig{::arrow::duration(::arrow::TimeUnit::NANO), false, 8}, + CaseConfig{::arrow::date32(), false}, + CaseConfig{::arrow::time32(::arrow::TimeUnit::MILLI), true}, + CaseConfig{::arrow::time64(::arrow::TimeUnit::NANO), false}, + CaseConfig{::arrow::timestamp(::arrow::TimeUnit::NANO), true}, + 
CaseConfig{::arrow::duration(::arrow::TimeUnit::NANO), false}, // Nested types - CaseConfig{::arrow::list(::arrow::int32()), false, 16}, - CaseConfig{::arrow::list(::arrow::int32()), true, 18}, - CaseConfig{::arrow::list(::arrow::utf8()), true, 18}, - CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false, 8}, - CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true, - 10}, + CaseConfig{::arrow::list(::arrow::int32()), false}, + CaseConfig{::arrow::list(::arrow::int32()), true}, + CaseConfig{::arrow::list(::arrow::utf8()), true}, + CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false}, + CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true}, CaseConfig{ ::arrow::list(::arrow::struct_({::arrow::field("f0", ::arrow::int32())})), - false, 16}, + false}, // Extension type - CaseConfig{::arrow::uuid(), true, 16}, + CaseConfig{::arrow::uuid(), true}, // Use ParquetDataPageVersion::V2 - CaseConfig{::arrow::large_binary(), false, 16, ParquetDataPageVersion::V2}, - CaseConfig{::arrow::list(::arrow::utf8()), true, 18, - ParquetDataPageVersion::V2})); + CaseConfig{::arrow::large_binary(), false, ParquetDataPageVersion::V2}, + CaseConfig{::arrow::list(::arrow::utf8()), true, ParquetDataPageVersion::V2})); class TestCDCMultipleRowGroups : public ::testing::Test { protected: + static auto constexpr kPartLength = 256 * 1024; + static auto constexpr kEditLength = 128; + static auto constexpr kRowGroupLength = 128 * 1024; + static auto constexpr kEnableDictionary = false; + static auto constexpr kMinChunkSize = 4 * 1024; + static auto constexpr kMaxChunkSize = 16 * 1024; + // Column random table parts for testing std::shared_ptr dtype_; std::shared_ptr
part1_, part2_, part3_; std::shared_ptr
edit1_, edit2_, edit3_; void SetUp() override { - auto constexpr kPartLength = 256 * 1024; - auto constexpr kEditLength = 128; - dtype_ = ::arrow::int32(); auto field = ::arrow::field("f0", dtype_, true); auto schema = ::arrow::schema({field}); @@ -1253,149 +1433,144 @@ class TestCDCMultipleRowGroups : public ::testing::Test { }; TEST_F(TestCDCMultipleRowGroups, InsertOnce) { - auto constexpr kRowGroupLength = 128 * 1024; - auto constexpr kEnableDictionary = false; - auto constexpr kMinChunkSize = 0 * 1024; - auto constexpr kMaxChunkSize = 128 * 1024; - ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_})); - ASSERT_OK_AND_ASSIGN(auto inserted, + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, edit1_, edit2_, part2_, part3_})); - ASSERT_FALSE(base->Equals(*inserted)); - ASSERT_EQ(inserted->num_rows(), base->num_rows() + edit2_->num_rows()); + ASSERT_FALSE(base->Equals(*modified)); + ASSERT_EQ(modified->num_rows(), base->num_rows() + edit2_->num_rows()); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); ASSERT_OK_AND_ASSIGN( - auto inserted_info, - WriteAndGetParquetInfo(inserted, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there are 7 row groups ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(inserted_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - 
ASSERT_EQ(base_info.at(0).page_lengths, inserted_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, inserted_info.at(1).page_lengths); - for (size_t i = 2; i < inserted_info.size() - 1; i++) { - AssertPageLengthDifferences(base_info.at(i), inserted_info.at(i), + ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + for (size_t i = 2; i < modified_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/1, edit2_->num_rows()); + /*exact_number_of_smaller_diffs=*/1, edit2_->column(0)); } - AssertPageLengthDifferences(base_info.back(), inserted_info.back(), + AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/0, edit2_->num_rows()); + /*exact_number_of_smaller_diffs=*/0, edit2_->column(0)); } TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { - auto constexpr kRowGroupLength = 128 * 1024; - auto constexpr kEnableDictionary = false; - auto constexpr kMinChunkSize = 0 * 1024; - auto constexpr kMaxChunkSize = 128 * 1024; - ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); - ASSERT_OK_AND_ASSIGN(auto deleted, ConcatAndCombine({part1_, part2_, part3_, edit2_})); - ASSERT_FALSE(base->Equals(*deleted)); - ASSERT_EQ(deleted->num_rows(), base->num_rows() - edit1_->num_rows()); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, part2_, part3_, edit2_})); + ASSERT_FALSE(base->Equals(*modified)); + ASSERT_EQ(modified->num_rows(), base->num_rows() - edit1_->num_rows()); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto 
base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); ASSERT_OK_AND_ASSIGN( - auto deleted_info, - WriteAndGetParquetInfo(deleted, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(deleted_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - ASSERT_EQ(base_info.at(0).page_lengths, deleted_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, deleted_info.at(1).page_lengths); - for (size_t i = 2; i < deleted_info.size() - 1; i++) { - AssertPageLengthDifferences(base_info.at(i), deleted_info.at(i), + ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + for (size_t i = 2; i < modified_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/1, edit1_->num_rows()); + /*exact_number_of_smaller_diffs=*/1, edit1_->column(0)); } - AssertPageLengthDifferences(base_info.back(), deleted_info.back(), + AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, - /*exact_number_of_smaller_diffs=*/1, edit1_->num_rows()); + /*exact_number_of_smaller_diffs=*/1, edit1_->column(0)); } TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { - auto constexpr kRowGroupLength = 128 * 1024; - auto constexpr kEnableDictionary = false; - auto constexpr kMinChunkSize = 0 * 1024; - 
auto constexpr kMaxChunkSize = 128 * 1024; - ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); - ASSERT_OK_AND_ASSIGN(auto updated, + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, edit3_, part2_, part3_, edit2_})); - ASSERT_FALSE(base->Equals(*updated)); + ASSERT_FALSE(base->Equals(*modified)); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); ASSERT_OK_AND_ASSIGN( - auto updated_info, - WriteAndGetParquetInfo(updated, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + + // assert that there are 7 row groups ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(updated_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - ASSERT_EQ(base_info.at(0).page_lengths, updated_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, updated_info.at(1).page_lengths); - AssertPageLengthDifferences(base_info.at(2), updated_info.at(2), + ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + AssertPageLengthDifferences(base_info.at(2), modified_info.at(2), /*max_number_of_equal_diffs=*/1); - for (size_t i = 2; i < updated_info.size(); i++) { - ASSERT_EQ(base_info.at(i).page_lengths, updated_info.at(i).page_lengths); + for (size_t i = 2; i < modified_info.size(); i++) { + 
ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); } } TEST_F(TestCDCMultipleRowGroups, Append) { - auto constexpr kRowGroupLength = 128 * 1024; - auto constexpr kEnableDictionary = false; - auto constexpr kMinChunkSize = 0 * 1024; - auto constexpr kMaxChunkSize = 128 * 1024; - ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, edit1_, part2_, part3_})); - ASSERT_OK_AND_ASSIGN(auto appended, + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part1_, edit1_, part2_, part3_, edit2_})); - ASSERT_FALSE(base->Equals(*appended)); - ASSERT_EQ(appended->num_rows(), base->num_rows() + edit2_->num_rows()); + ASSERT_FALSE(base->Equals(*modified)); + ASSERT_EQ(modified->num_rows(), base->num_rows() + edit2_->num_rows()); ASSERT_OK_AND_ASSIGN( - auto base_info, - WriteAndGetParquetInfo(base, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); ASSERT_OK_AND_ASSIGN( - auto appended_info, - WriteAndGetParquetInfo(appended, kMinChunkSize, kMaxChunkSize, kEnableDictionary, - ParquetDataPageVersion::V1, kRowGroupLength)); + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + kEnableDictionary, ParquetDataPageVersion::V1)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there are 7 row groups ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(appended_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - for (size_t i = 0; i < appended_info.size() - 1; i++) { - ASSERT_EQ(base_info.at(i).page_lengths, appended_info.at(i).page_lengths); + for (size_t i = 0; i < modified_info.size() - 1; i++) { + ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); } // only the last 
row group should have more or equal number of pages auto original_page_lengths = base_info.back().page_lengths; - auto appended_page_lengths = appended_info.back().page_lengths; - ASSERT_GE(original_page_lengths.size(), appended_page_lengths.size()); + auto modified_page_lengths = modified_info.back().page_lengths; + ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { - ASSERT_EQ(original_page_lengths[i], appended_page_lengths[i]); + ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); } - ASSERT_GT(appended_page_lengths.back(), original_page_lengths.back()); + ASSERT_GT(modified_page_lengths.back(), original_page_lengths.back()); } } // namespace parquet::internal From d13c89f1ad02aab267f9f9455bd0285e88c53c33 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 00:17:33 +0200 Subject: [PATCH 083/102] Add prepend test case --- cpp/src/parquet/chunker_internal_test.cc | 46 +++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 435783eaec3..814fb69ab70 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -793,7 +793,7 @@ void AssertContentDefinedChunkSizes(const std::shared_ptr<::arrow::ChunkedArray> } if (::arrow::is_fixed_width(type_id) || ::arrow::is_base_binary_like(type_id)) { - auto offset = 0; + int64_t offset = 0; auto page_lengths = column_info.page_lengths; for (size_t i = 0; i < page_lengths.size() - 1; i++) { @@ -1275,6 +1275,50 @@ TEST_P(TestCDCSingleRowGroup, InsertTwice) { } } +TEST_P(TestCDCSingleRowGroup, Prepend) { + const auto& param = GetParam(); + + ASSERT_OK_AND_ASSIGN(auto base, ConcatAndCombine({part1_, part2_, part3_})); + ASSERT_OK_AND_ASSIGN(auto modified, ConcatAndCombine({part4_, part1_, part2_, part3_})); + ASSERT_FALSE(base->Equals(*modified)); + + for (bool 
enable_dictionary : {false, true}) { + ASSERT_OK_AND_ASSIGN( + auto base_parquet, + WriteTableToBuffer(base, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + ASSERT_OK_AND_ASSIGN( + auto modified_parquet, + WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, + enable_dictionary, param.data_page_version)); + + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + + // assert that there is only one row group + ASSERT_EQ(base_info.size(), 1); + ASSERT_EQ(modified_info.size(), 1); + + AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, + kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + AssertContentDefinedChunkSizes(modified->column(0), modified_info.front(), + param.is_nullable, kMinChunkSize, kMaxChunkSize, + /*expect_dictionary_page=*/enable_dictionary); + + auto original_page_lengths = base_info.front().page_lengths; + auto modified_page_lengths = modified_info.front().page_lengths; + + // we expect to have the same number or more pages at the beginning of the + // modified file without increasing the size of any subsequent page + ASSERT_LE(original_page_lengths.size(), modified_page_lengths.size()); + AssertPageLengthDifferences(base_info.front(), modified_info.front(), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/0, part4_->column(0)); + } +} + TEST_P(TestCDCSingleRowGroup, Append) { const auto& param = GetParam(); From 7aec1cdf5cdaaaf1f4cd49e5eb5157990bb19d96 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 00:19:21 +0200 Subject: [PATCH 084/102] Rename CDCOptions to CdcOptions --- cpp/src/parquet/properties.h | 18 +++++++++--------- python/pyarrow/_parquet.pxd | 4 ++-- python/pyarrow/_parquet.pyx | 2 +- 3 files changed, 12 insertions(+), 
12 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index b4cf24dd92f..82a4c900145 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -246,7 +246,7 @@ class PARQUET_EXPORT ColumnProperties { }; // EXPERIMENTAL: Options for content-defined chunking. -struct PARQUET_EXPORT CDCOptions { +struct PARQUET_EXPORT CdcOptions { /// Minimum chunk size in bytes, default 256 KiB /// The rolling hash will not be updated until this size is reached for each chunk. /// Note that all data sent through the hash function is counted towards the chunk @@ -271,7 +271,7 @@ struct PARQUET_EXPORT CDCOptions { int norm_factor = 0; }; -static constexpr CDCOptions kDefaultCDCOptions = CDCOptions{256 * 1024, 1024 * 1024, 0}; +static constexpr CdcOptions kDefaultCdcOptions = CdcOptions{256 * 1024, 1024 * 1024, 0}; class PARQUET_EXPORT WriterProperties { public: @@ -290,7 +290,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), content_defined_chunking_enabled_(false), - content_defined_chunking_options_(kDefaultCDCOptions) {} + content_defined_chunking_options_(kDefaultCdcOptions) {} explicit Builder(const WriterProperties& properties) : pool_(properties.memory_pool()), @@ -331,8 +331,8 @@ class PARQUET_EXPORT WriterProperties { return this; } - /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CDCOptions. - Builder* content_defined_chunking_options(const CDCOptions options) { + /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions. 
+ Builder* content_defined_chunking_options(const CdcOptions options) { content_defined_chunking_options_ = options; return this; } @@ -791,7 +791,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map page_index_enabled_; bool content_defined_chunking_enabled_; - CDCOptions content_defined_chunking_options_; + CdcOptions content_defined_chunking_options_; }; inline MemoryPool* memory_pool() const { return pool_; } @@ -819,7 +819,7 @@ class PARQUET_EXPORT WriterProperties { inline bool content_defined_chunking_enabled() const { return content_defined_chunking_enabled_; } - inline CDCOptions content_defined_chunking_options() const { + inline CdcOptions content_defined_chunking_options() const { return content_defined_chunking_options_; } @@ -926,7 +926,7 @@ class PARQUET_EXPORT WriterProperties { const std::unordered_map& column_properties, ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, std::vector sorting_columns, bool content_defined_chunking_enabled, - CDCOptions content_defined_chunking_options) + CdcOptions content_defined_chunking_options) : pool_(pool), dictionary_pagesize_limit_(dictionary_pagesize_limit), write_batch_size_(write_batch_size), @@ -965,7 +965,7 @@ class PARQUET_EXPORT WriterProperties { std::unordered_map column_properties_; bool content_defined_chunking_enabled_; - CDCOptions content_defined_chunking_options_; + CdcOptions content_defined_chunking_options_; }; PARQUET_EXPORT const std::shared_ptr& default_writer_properties(); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 4433b17253a..3cfc5a7c14a 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -462,7 +462,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: - cdef cppclass CDCOptions: + cdef cppclass CdcOptions: int64_t min_chunk_size int64_t max_chunk_size int norm_factor @@ -502,7 +502,7 @@ cdef extern 
from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_page_checksum() Builder* enable_content_defined_chunking() Builder* disable_content_defined_chunking() - Builder* content_defined_chunking_options(const CDCOptions options) + Builder* content_defined_chunking_options(const CdcOptions options) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 9e79490e3e4..fa073d7e64d 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1979,7 +1979,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( cdef: shared_ptr[WriterProperties] properties WriterProperties.Builder props - CDCOptions cdc_options + CdcOptions cdc_options # data_page_version From e2229cec0303b1351cdf451b3f7e3de2877f346f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 00:51:40 +0200 Subject: [PATCH 085/102] Some more comments in the test suite --- cpp/src/parquet/chunker_internal_test.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 814fb69ab70..c76588d34da 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -656,6 +656,9 @@ void AssertPageLengthDifferences(const ColumnInfo& original, const ColumnInfo& m // lenght differences are exactly equal to the edit_length. auto diffs = FindDifferences(original.page_lengths, modified.page_lengths); + // Note, the assertion function assumes that all edits are made using the same edit + // array, this could be improved by passing a list of edit arrays to the function + // and calculating the edit length for each edit array. 
int64_t edit_length = edit_array->length(); if (::arrow::is_list_like(edit_array->type()->id())) { // add null and empty lists to the edit length because the page length corresponds to @@ -720,6 +723,9 @@ void AssertPageLengthDifferences(const ColumnInfo& original, const ColumnInfo& m int64_t left_sum = 0, right_sum = 0; for (const auto& val : diff.first) left_sum += val; for (const auto& val : diff.second) right_sum += val; + // This is only used from the UpdateOnce and UpdateTwice test cases where the edit(s) + // don't change the length of the original array, only update the value. This happens + // to apply to the list types as well because of the consistent array data generation. ASSERT_EQ(left_sum, right_sum); } @@ -822,7 +828,9 @@ void AssertContentDefinedChunkSizes(const std::shared_ptr<::arrow::ChunkedArray> ASSERT_EQ(offset, array->length()); } - // TODO(kszucs): have approximate size assertions for variable length types + // TODO(kszucs): have approximate size assertions for variable length types because + // we cannot calculate accurate CDC chunk sizes for list-like types without actually + // scanning the data and reimplementing the logic from the CDC chunker } class TestCDC : public ::testing::Test { From 6fe02230fba8e17f3e2f0b92d4092123c83f685f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 02:01:03 +0200 Subject: [PATCH 086/102] Some more comments --- cpp/src/parquet/chunker_internal.cc | 4 +- cpp/src/parquet/chunker_internal_test.cc | 53 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 610b7054dd0..88b34ffe688 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -127,7 +127,6 @@ class ContentDefinedChunker::Impl { void Roll(const uint8_t* value) { // Update the rolling hash with a compile-time known sized value, set has_matched_ to // true if the hash matches the mask. 
- chunk_size_ += kByteWidth; if (chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the @@ -148,7 +147,6 @@ class ContentDefinedChunker::Impl { void Roll(const uint8_t* value, int64_t length) { // Update the rolling hash with a binary-like value, set has_matched_ to true if the // hash matches the mask. - chunk_size_ += length; if (chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the @@ -172,6 +170,8 @@ class ContentDefinedChunker::Impl { // we use central limit theorem to approximate normal distribution, see // section 6.2.1 in paper https://www.cidrdb.org/cidr2023/papers/p43-low.pdf) if (ARROW_PREDICT_FALSE(++nth_run_ >= kNumGearhashTables)) { + // note that we choose not to reset the rolling hash state here, nor anywhere else + // in the code, in practice this doesn't seem to affect the chunking effectiveness nth_run_ = 0; chunk_size_ = 0; return true; diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index c76588d34da..0628ab4c005 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -1079,6 +1079,7 @@ TEST_P(TestCDCSingleRowGroup, DeleteOnce) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1086,6 +1087,11 @@ TEST_P(TestCDCSingleRowGroup, DeleteOnce) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there is a single "diff" between the two page length sequences + // and that the diff removes edit_length number of values, there should be no + // other differences because we deal with a single row group (in case of multiple + // row groups 
the first page of each subsequent row group would be different due + // to shifting caused by the fixed sized row group length) AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, @@ -1118,6 +1124,7 @@ TEST_P(TestCDCSingleRowGroup, DeleteTwice) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1125,6 +1132,10 @@ TEST_P(TestCDCSingleRowGroup, DeleteTwice) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there are exactly two "diffs" between the two page length sequences + // and those diffs remove edit_length number of values (part2 and part4 have the + // same number of values), there should be no other differences because we have + // a single row group AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, @@ -1156,6 +1167,7 @@ TEST_P(TestCDCSingleRowGroup, UpdateOnce) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1163,6 +1175,9 @@ TEST_P(TestCDCSingleRowGroup, UpdateOnce) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there is a single "diff" between the two page length sequences + // which doesn't change the length of the array, only the values are updated + // there should be no other differences because we deal with a single row group 
AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*max_number_of_equal_diffs=*/1); } @@ -1194,6 +1209,7 @@ TEST_P(TestCDCSingleRowGroup, UpdateTwice) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1201,6 +1217,8 @@ TEST_P(TestCDCSingleRowGroup, UpdateTwice) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there are exactly two "diffs" between the two page length sequences + // which don't change the length of the array, only the values are updated AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*max_number_of_equal_diffs=*/2); } @@ -1230,6 +1248,7 @@ TEST_P(TestCDCSingleRowGroup, InsertOnce) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1237,6 +1256,9 @@ TEST_P(TestCDCSingleRowGroup, InsertOnce) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there is a single "diff" between the two page length sequences + // adding edit_length number of values, there should be no other differences + // because we deal with a single row group and made a single modification AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, @@ -1269,6 +1291,7 @@ TEST_P(TestCDCSingleRowGroup, InsertTwice) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range 
AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1276,6 +1299,8 @@ TEST_P(TestCDCSingleRowGroup, InsertTwice) { param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); + // check that there are exactly two "diffs" between the two page length sequences + // which add edit_length number of values, there should be no other differences AssertPageLengthDifferences(base_info.front(), modified_info.front(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/2, @@ -1307,6 +1332,7 @@ TEST_P(TestCDCSingleRowGroup, Prepend) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1351,6 +1377,7 @@ TEST_P(TestCDCSingleRowGroup, Append) { ASSERT_EQ(base_info.size(), 1); ASSERT_EQ(modified_info.size(), 1); + // check that the chunk sizes are within the expected range AssertContentDefinedChunkSizes(base->column(0), base_info.front(), param.is_nullable, kMinChunkSize, kMaxChunkSize, /*expect_dictionary_page=*/enable_dictionary); @@ -1361,7 +1388,10 @@ TEST_P(TestCDCSingleRowGroup, Append) { auto original_page_lengths = base_info.front().page_lengths; auto modified_page_lengths = modified_info.front().page_lengths; + // there are either additional pages and/or the last page is larger in the modified + // than in the original file ASSERT_LE(original_page_lengths.size(), modified_page_lengths.size()); + // all pages must be identical except for the last one which can be larger for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); } @@ -1395,6 +1425,7 @@ TEST_P(TestCDCSingleRowGroup, EmptyTable) { } 
TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { + // check that the array offsets are respected in the chunker const auto& param = GetParam(); ASSERT_OK_AND_ASSIGN(auto table, ConcatAndCombine({part1_, part2_, part3_})); @@ -1507,14 +1538,21 @@ TEST_F(TestCDCMultipleRowGroups, InsertOnce) { ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(modified_info.size(), 7); + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + // then there is an insertion which causes a larger "diff" somewhere in the row group + // and a smaller "diff" at the end of the row group because the row group length is + // fixed; this rule applies to the subsequent row groups as well because the values + // are shifted by the insertion for (size_t i = 2; i < modified_info.size() - 1; i++) { AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, /*exact_number_of_smaller_diffs=*/1, edit2_->column(0)); } + // the last row group will simply be larger because of the insertion AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, @@ -1540,17 +1578,24 @@ TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + // assert that there are 7 row groups ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(modified_info.size(), 7); + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); 
for (size_t i = 2; i < modified_info.size() - 1; i++) { + // because of the deletion values are shifted in the row group, we expect a smaller + // "diff" at the beginning of the row group and a larger "diff" at the end of the + // row group AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, /*exact_number_of_smaller_diffs=*/1, edit1_->column(0)); } + // the last row group will simply be smaller because of the deletion AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/0, @@ -1580,11 +1625,16 @@ TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { ASSERT_EQ(base_info.size(), 7); ASSERT_EQ(modified_info.size(), 7); + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + // then there is an update (without insertion or deletion so no shifting occurs) which + // causes a "diff" with both sides having the same number of values but different ones AssertPageLengthDifferences(base_info.at(2), modified_info.at(2), /*max_number_of_equal_diffs=*/1); for (size_t i = 2; i < modified_info.size(); i++) { + // the rest of the row groups should be identical ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); } } @@ -1618,7 +1668,10 @@ TEST_F(TestCDCMultipleRowGroups, Append) { // only the last row group should have more or equal number of pages auto original_page_lengths = base_info.back().page_lengths; auto modified_page_lengths = modified_info.back().page_lengths; + + // the last row group should be larger or equal in size ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); + // all pages must be identical except for the last one which can be larger for 
(size_t i = 0; i < original_page_lengths.size() - 1; i++) { ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); } From feee7e7db43bada98e937cd25daa74d680923af0 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 02:09:27 +0200 Subject: [PATCH 087/102] Remove redundant assertions --- cpp/src/parquet/chunker_internal_test.cc | 37 ++++++++++-------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 0628ab4c005..738a87a6a83 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -579,6 +579,22 @@ TEST(TestFindDifferences, DifferentLengths) { ASSERT_EQ(diffs[0].second, ChunkList({4, 5})); } +TEST(TestFindDifferences, ChangesAtBothEnds) { + ChunkList first = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + ChunkList second = {0, 0, 2, 3, 4, 5, 7, 7, 8}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 3); + ASSERT_EQ(diffs[0].first, ChunkList({1})); + ASSERT_EQ(diffs[0].second, ChunkList({0, 0})); + + ASSERT_EQ(diffs[1].first, ChunkList({6})); + ASSERT_EQ(diffs[1].second, ChunkList({7})); + + ASSERT_EQ(diffs[2].first, ChunkList({9})); + ASSERT_EQ(diffs[2].second, ChunkList({})); +} + TEST(TestFindDifferences, EmptyArrays) { ChunkList first = {}; ChunkList second = {}; @@ -596,11 +612,6 @@ TEST(TestFindDifferences, LongSequenceWithSingleDifference) { ASSERT_EQ(diffs.size(), 1); ASSERT_EQ(diffs[0].first, ChunkList({1994, 2193})); ASSERT_EQ(diffs[0].second, ChunkList({2048, 43, 2080})); - - // Verify that elements after the difference are identical - for (size_t i = 3; i < second.size(); i++) { - ASSERT_EQ(first[i - 1], second[i]); - } } TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { @@ -613,14 +624,6 @@ TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { ASSERT_EQ(diffs.size(), 1); ASSERT_EQ(diffs[0].first, ChunkList({1934, 1772, 1914, 2075, 2154})); 
ASSERT_EQ(diffs[0].second, ChunkList({2265, 1804, 1717, 1925, 2122})); - - // Verify elements before and after the difference are identical - for (size_t i = 0; i < 4; i++) { - ASSERT_EQ(first[i], second[i]); - } - for (size_t i = 9; i < first.size(); i++) { - ASSERT_EQ(first[i], second[i]); - } } TEST(TestFindDifferences, AdditionalCase) { @@ -632,14 +635,6 @@ TEST(TestFindDifferences, AdditionalCase) { ASSERT_EQ(diffs[0].first, ChunkList({401})); ASSERT_EQ(diffs[0].second, ChunkList({393})); - - // Verify elements before and after the difference are identical - for (size_t i = 0; i < 3; i++) { - ASSERT_EQ(original[i], modified[i]); - } - for (size_t i = 4; i < original.size(); i++) { - ASSERT_EQ(original[i], modified[i]); - } } void AssertPageLengthDifferences(const ColumnInfo& original, const ColumnInfo& modified, From 2032b3ad4f118617af86ae6cafb81ad4f8f15089 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 02:14:31 +0200 Subject: [PATCH 088/102] Remove the capture in CalculateBinaryLike closure --- cpp/src/parquet/chunker_internal.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 88b34ffe688..fe332e206c4 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -312,10 +312,9 @@ class ContentDefinedChunker::Impl { const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { const auto& array = checked_cast(values); - const uint8_t* value; - typename ArrayType::offset_type length; return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { - value = array.GetValue(i, &length); + typename ArrayType::offset_type length; + const uint8_t* value = array.GetValue(i, &length); Roll(value, length); }); } From 61731c6b0eac5e3dea43e5d5d093c69fdc07c56b Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 13:29:59 +0200 Subject: [PATCH 089/102] Migrate from DCHECK to 
ARROW_DCHECK --- cpp/src/parquet/chunker_internal.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index fe332e206c4..09247c8a1b9 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -24,7 +24,7 @@ #include "arrow/array.h" #include "arrow/util/bit_util.h" -#include "arrow/util/logging.h" +#include "arrow/util/logging_internal.h" #include "arrow/visit_type_inline.h" #include "parquet/chunker_internal_generated.h" #include "parquet/exception.h" @@ -189,12 +189,12 @@ class ContentDefinedChunker::Impl { void ValidateChunks(const std::vector& chunks, int64_t num_levels) const { // chunks must be non-empty and monotonic increasing - DCHECK(!chunks.empty()); + ARROW_DCHECK(!chunks.empty()); // the first chunk must start at the first level auto first_chunk = chunks.front(); - DCHECK_EQ(first_chunk.level_offset, 0); - DCHECK_EQ(first_chunk.value_offset, 0); + ARROW_DCHECK_EQ(first_chunk.level_offset, 0); + ARROW_DCHECK_EQ(first_chunk.value_offset, 0); // the following chunks must be contiguous, non-overlapping and monotonically // increasing @@ -202,16 +202,17 @@ class ContentDefinedChunker::Impl { for (size_t i = 1; i < chunks.size(); ++i) { auto chunk = chunks[i]; auto prev_chunk = chunks[i - 1]; - DCHECK_GT(chunk.levels_to_write, 0); - DCHECK_GE(chunk.value_offset, prev_chunk.value_offset); - DCHECK_EQ(chunk.level_offset, prev_chunk.level_offset + prev_chunk.levels_to_write); + ARROW_DCHECK_GT(chunk.levels_to_write, 0); + ARROW_DCHECK_GE(chunk.value_offset, prev_chunk.value_offset); + ARROW_DCHECK_EQ(chunk.level_offset, + prev_chunk.level_offset + prev_chunk.levels_to_write); sum_levels += chunk.levels_to_write; } - DCHECK_EQ(sum_levels, num_levels); + ARROW_DCHECK_EQ(sum_levels, num_levels); // the last chunk must end at the last level auto last_chunk = chunks.back(); - DCHECK_EQ(last_chunk.level_offset + 
last_chunk.levels_to_write, num_levels); + ARROW_DCHECK_EQ(last_chunk.level_offset + last_chunk.levels_to_write, num_levels); } template From 4966f9cdc05793b5ec31f3002532b249551997cf Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 13:49:00 +0200 Subject: [PATCH 090/102] Use PLAIN encoding in the pyarrow test so that we can have stricter assertions --- .../tests/parquet/test_parquet_writer.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 7bc14f3da2f..7fd30c73576 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -366,10 +366,18 @@ def test_parquet_writer_append_key_value_metadata(tempdir): def test_parquet_content_defined_chunking(tempdir): table = pa.table({'a': range(100_000)}) - pq.write_table(table, tempdir / 'unchunked.parquet') + # use PLAIN encoding because we compare the overall size of the row groups + # which would vary depending on the encoding making the assertions wrong + pq.write_table(table, tempdir / 'unchunked.parquet', + use_dictionary=False, + column_encoding="PLAIN") pq.write_table(table, tempdir / 'chunked-default.parquet', + use_dictionary=False, + column_encoding="PLAIN", use_content_defined_chunking=True) pq.write_table(table, tempdir / 'chunked-custom.parquet', + use_dictionary=False, + column_encoding="PLAIN", use_content_defined_chunking={"min_chunk_size": 32_768, "max_chunk_size": 65_536}) @@ -394,11 +402,11 @@ def test_parquet_content_defined_chunking(tempdir): rg_chunked_custom = chunked_custom_metadata.row_group(i) assert rg_unchunked.num_rows == rg_chunked_default.num_rows assert rg_unchunked.num_rows == rg_chunked_custom.num_rows - # since PageReader is not exposed we don't cannot inspect the page sizes + # since PageReader is not exposed we cannot inspect the page sizes # so just check 
that the total byte size is different - assert rg_unchunked.total_byte_size != rg_chunked_default.total_byte_size - assert rg_unchunked.total_byte_size != rg_chunked_custom.total_byte_size - assert rg_chunked_default.total_byte_size != rg_chunked_custom.total_byte_size + assert rg_unchunked.total_byte_size < rg_chunked_default.total_byte_size + assert rg_unchunked.total_byte_size < rg_chunked_custom.total_byte_size + assert rg_chunked_default.total_byte_size < rg_chunked_custom.total_byte_size def test_parquet_content_defined_chunking_parameters(tempdir): From ae5c929da7b9c780eff3b91fb802cf3c89b86a1d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 13:58:05 +0200 Subject: [PATCH 091/102] Mention to use the same cdc parameters --- docs/source/python/parquet.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 7183fa05d68..1d2664e81e2 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -817,6 +817,12 @@ Note that the chunk size is calculated on the logical values before applying any or compression. The actual size of the data pages may vary based on the encoding and compression used. +.. note:: + Ensure that Parquet write options remain consistent across writes and files. + Using different write options (like compression, encoding, or row group size) + for different files may prevent proper deduplication and lead to suboptimal + storage efficiency. + .. 
code-block:: python import pyarrow as pa From 0aa90c413a98cb3e89108dff7fad18ba4d3d0f28 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 4 Apr 2025 17:03:41 +0200 Subject: [PATCH 092/102] Change the multi row-group tests to use more columns --- cpp/src/parquet/chunker_internal_test.cc | 179 ++++++++++++----------- 1 file changed, 95 insertions(+), 84 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 738a87a6a83..d5e23c4fc81 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -1483,22 +1483,24 @@ INSTANTIATE_TEST_SUITE_P( class TestCDCMultipleRowGroups : public ::testing::Test { protected: - static auto constexpr kPartLength = 256 * 1024; + static auto constexpr kPartLength = 128 * 1024; static auto constexpr kEditLength = 128; - static auto constexpr kRowGroupLength = 128 * 1024; + static auto constexpr kRowGroupLength = 64 * 1024; static auto constexpr kEnableDictionary = false; static auto constexpr kMinChunkSize = 4 * 1024; static auto constexpr kMaxChunkSize = 16 * 1024; // Column random table parts for testing - std::shared_ptr dtype_; + std::shared_ptr
<Table> part1_, part2_, part3_; std::shared_ptr<Table>
edit1_, edit2_, edit3_; void SetUp() override { - dtype_ = ::arrow::int32(); - auto field = ::arrow::field("f0", dtype_, true); - auto schema = ::arrow::schema({field}); + auto schema = ::arrow::schema({ + ::arrow::field("int32", ::arrow::int32(), true), + ::arrow::field("float64", ::arrow::float64(), true), + ::arrow::field("bool", ::arrow::boolean(), false), + }); ASSERT_OK_AND_ASSIGN(part1_, GenerateTable(schema, kPartLength, /*seed=*/0)); ASSERT_OK_AND_ASSIGN(part2_, GenerateTable(schema, kPartLength, /*seed=*/2)); @@ -1526,32 +1528,35 @@ TEST_F(TestCDCMultipleRowGroups, InsertOnce) { WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, kEnableDictionary, ParquetDataPageVersion::V1)); - auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); - auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); - - // assert that there are 7 row groups - ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(modified_info.size(), 7); - - // the first two row groups should be identical, each part contains two row groups and - // the first part is not modified - ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); - // then there is an insertion which causes a larger "diff" somewhere in the row group - // and a smaller "diff" at the end of the row group because the row group length is - // fixed; this rule applies to the subsequent row groups as well because the values - // are shifted by the insertion - for (size_t i = 2; i < modified_info.size() - 1; i++) { - AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), + for (int col = 0; col < base->num_columns(); col++) { + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/col); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/col); + + // assert that there are 7 row groups + ASSERT_EQ(base_info.size(), 7); + 
ASSERT_EQ(modified_info.size(), 7); + + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified + ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + // then there is an insertion which causes a larger "diff" somewhere in the row group + // and a smaller "diff" at the end of the row group because the row group length is + // fixed; this rule applies to the subsequent row groups as well because the values + // are shifted by the insertion + auto edit_array = edit2_->column(col); + for (size_t i = 2; i < modified_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/1, edit_array); + } + // the last row group will simply be larger because of the insertion + AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/1, edit2_->column(0)); + /*exact_number_of_smaller_diffs=*/0, edit_array); } - // the last row group will simply be larger because of the insertion - AssertPageLengthDifferences(base_info.back(), modified_info.back(), - /*exact_number_of_equal_diffs=*/0, - /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/0, edit2_->column(0)); } TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { @@ -1570,31 +1575,34 @@ TEST_F(TestCDCMultipleRowGroups, DeleteOnce) { WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, kEnableDictionary, ParquetDataPageVersion::V1)); - auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); - auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + for (int col = 0; col < base->num_columns(); col++) { + auto base_info = 
GetColumnParquetInfo(base_parquet, /*column_index=*/col); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/col); - // assert that there are 7 row groups - ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(modified_info.size(), 7); + // assert that there are 7 row groups + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - // the first two row groups should be identical, each part contains two row groups and - // the first part is not modified - ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); - for (size_t i = 2; i < modified_info.size() - 1; i++) { + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified + ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); // because of the deletion values are shifted in the row group, we expect a smaller // "diff" at the beginning of the row group and a larger "diff" at the end of the // row group - AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), + auto edit_array = edit2_->column(col); + for (size_t i = 2; i < modified_info.size() - 1; i++) { + AssertPageLengthDifferences(base_info.at(i), modified_info.at(i), + /*exact_number_of_equal_diffs=*/0, + /*exact_number_of_larger_diffs=*/1, + /*exact_number_of_smaller_diffs=*/1, edit_array); + } + // the last row group will simply be smaller because of the deletion + AssertPageLengthDifferences(base_info.back(), modified_info.back(), /*exact_number_of_equal_diffs=*/0, - /*exact_number_of_larger_diffs=*/1, - /*exact_number_of_smaller_diffs=*/1, edit1_->column(0)); + /*exact_number_of_larger_diffs=*/0, + /*exact_number_of_smaller_diffs=*/1, edit_array); } - // the last row group will simply be smaller because of the deletion - 
AssertPageLengthDifferences(base_info.back(), modified_info.back(), - /*exact_number_of_equal_diffs=*/0, - /*exact_number_of_larger_diffs=*/0, - /*exact_number_of_smaller_diffs=*/1, edit1_->column(0)); } TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { @@ -1612,25 +1620,26 @@ TEST_F(TestCDCMultipleRowGroups, UpdateOnce) { auto modified_parquet, WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, kEnableDictionary, ParquetDataPageVersion::V1)); - - auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); - auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); - - // assert that there are 7 row groups - ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(modified_info.size(), 7); - - // the first two row groups should be identical, each part contains two row groups and - // the first part is not modified - ASSERT_EQ(base_info.at(0).page_lengths, modified_info.at(0).page_lengths); - ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); - // then there is an update (without insertion or deletion so no shifting occurs) which - // causes a "diff" with both sides having the same number of values but different ones - AssertPageLengthDifferences(base_info.at(2), modified_info.at(2), - /*max_number_of_equal_diffs=*/1); - for (size_t i = 2; i < modified_info.size(); i++) { - // the rest of the row groups should be identical - ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); + for (int col = 0; col < base->num_columns(); col++) { + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/col); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/col); + + // assert that there are 7 row groups + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); + + // the first two row groups should be identical, each part contains two row groups and + // the first part is not modified + ASSERT_EQ(base_info.at(0).page_lengths, 
modified_info.at(0).page_lengths); + ASSERT_EQ(base_info.at(1).page_lengths, modified_info.at(1).page_lengths); + // then there is an update (without insertion or deletion so no shifting occurs) which + // causes a "diff" with both sides having the same number of values but different ones + AssertPageLengthDifferences(base_info.at(2), modified_info.at(2), + /*max_number_of_equal_diffs=*/1); + for (size_t i = 2; i < modified_info.size(); i++) { + // the rest of the row groups should be identical + ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); + } } } @@ -1650,27 +1659,29 @@ TEST_F(TestCDCMultipleRowGroups, Append) { WriteTableToBuffer(modified, kMinChunkSize, kMaxChunkSize, kRowGroupLength, kEnableDictionary, ParquetDataPageVersion::V1)); - auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/0); - auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/0); + for (int col = 0; col < base->num_columns(); col++) { + auto base_info = GetColumnParquetInfo(base_parquet, /*column_index=*/col); + auto modified_info = GetColumnParquetInfo(modified_parquet, /*column_index=*/col); - // assert that there are 7 row groups - ASSERT_EQ(base_info.size(), 7); - ASSERT_EQ(modified_info.size(), 7); + // assert that there are 7 row groups + ASSERT_EQ(base_info.size(), 7); + ASSERT_EQ(modified_info.size(), 7); - for (size_t i = 0; i < modified_info.size() - 1; i++) { - ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); - } - // only the last row group should have more or equal number of pages - auto original_page_lengths = base_info.back().page_lengths; - auto modified_page_lengths = modified_info.back().page_lengths; - - // the last row group should be larger or equal in size - ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); - // all pages must be identical except for the last one which can be larger - for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { - 
ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); + for (size_t i = 0; i < modified_info.size() - 1; i++) { + ASSERT_EQ(base_info.at(i).page_lengths, modified_info.at(i).page_lengths); + } + // only the last row group should have more or equal number of pages + auto original_page_lengths = base_info.back().page_lengths; + auto modified_page_lengths = modified_info.back().page_lengths; + + // the last row group should be larger or equal in size + ASSERT_GE(original_page_lengths.size(), modified_page_lengths.size()); + // all pages must be identical except for the last one which can be larger + for (size_t i = 0; i < original_page_lengths.size() - 1; i++) { + ASSERT_EQ(original_page_lengths[i], modified_page_lengths[i]); + } + ASSERT_GT(modified_page_lengths.back(), original_page_lengths.back()); } - ASSERT_GT(modified_page_lengths.back(), original_page_lengths.back()); } } // namespace parquet::internal From cd27277f084440354afb0397a5d11282f9bd6219 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 11:59:51 +0200 Subject: [PATCH 093/102] address review comments --- cpp/src/parquet/chunker_internal.cc | 2 +- cpp/src/parquet/chunker_internal_codegen.py | 1 + cpp/src/parquet/chunker_internal_generated.h | 1 + cpp/src/parquet/properties.h | 8 ++++---- cpp/src/parquet/properties_test.cc | 20 ++++++++++++++++++++ 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 09247c8a1b9..c59dba4d406 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -113,7 +113,7 @@ class ContentDefinedChunker::Impl { uint64_t GetRollingHashMask() const { return rolling_hash_mask_; } - void Roll(const bool value) { + void Roll(bool value) { if (++chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated diff --git 
a/cpp/src/parquet/chunker_internal_codegen.py b/cpp/src/parquet/chunker_internal_codegen.py index 5458d31dbfe..096196e5e2e 100644 --- a/cpp/src/parquet/chunker_internal_codegen.py +++ b/cpp/src/parquet/chunker_internal_codegen.py @@ -68,6 +68,7 @@ // under the License. #pragma once + #include namespace parquet::internal {{ diff --git a/cpp/src/parquet/chunker_internal_generated.h b/cpp/src/parquet/chunker_internal_generated.h index a09822a684e..c1e8a1b7e9f 100644 --- a/cpp/src/parquet/chunker_internal_generated.h +++ b/cpp/src/parquet/chunker_internal_generated.h @@ -16,6 +16,7 @@ // under the License. #pragma once + #include namespace parquet::internal { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 82a4c900145..02700f14375 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -251,14 +251,14 @@ struct PARQUET_EXPORT CdcOptions { /// The rolling hash will not be updated until this size is reached for each chunk. /// Note that all data sent through the hash function is counted towards the chunk /// size, including definition and repetition levels if present. - int64_t min_chunk_size; + int64_t min_chunk_size = 256 * 1024; /// Maximum chunk size in bytes, default is 1024 KiB /// The chunker will create a new chunk whenever the chunk size exceeds this value. /// Note that the parquet writer has a related `pagesize` property that controls /// the maximum size of a parquet data page after encoding. While setting /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the /// chunking effectiveness, it results in more small parquet data pages. 
- int64_t max_chunk_size; + int64_t max_chunk_size = 1024 * 1024; /// Number of bit adjustement to the gearhash mask in order to /// center the chunk size around the average size more aggressively, default 0 /// Increasing the normalization factor increases the probability of finding a chunk, @@ -271,7 +271,7 @@ struct PARQUET_EXPORT CdcOptions { int norm_factor = 0; }; -static constexpr CdcOptions kDefaultCdcOptions = CdcOptions{256 * 1024, 1024 * 1024, 0}; +static constexpr CdcOptions kDefaultCdcOptions = CdcOptions{}; class PARQUET_EXPORT WriterProperties { public: @@ -332,7 +332,7 @@ class PARQUET_EXPORT WriterProperties { } /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions. - Builder* content_defined_chunking_options(const CdcOptions options) { + Builder* content_defined_chunking_options(const CdcOptions& options) { content_defined_chunking_options_ = options; return this; } diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index 35fc1156591..82e48136877 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -111,6 +111,26 @@ TEST(TestWriterProperties, SetCodecOptions) { ->window_bits); } +TEST(TestWriterProperties, ContentDefinedChunkingSettings) { + WriterProperties::Builder builder; + std::shared_ptr props = builder.build(); + + ASSERT_FALSE(props->content_defined_chunking_enabled()); + auto cdc_options = props->content_defined_chunking_options(); + ASSERT_EQ(cdc_options.min_chunk_size, 256 * 1024); + ASSERT_EQ(cdc_options.max_chunk_size, 1024 * 1024); + ASSERT_EQ(cdc_options.norm_factor, 0); + + builder.enable_content_defined_chunking(); + builder.content_defined_chunking_options(CdcOptions{512 * 1024, 2048 * 1024, 1}); + props = builder.build(); + ASSERT_TRUE(props->content_defined_chunking_enabled()); + cdc_options = props->content_defined_chunking_options(); + ASSERT_EQ(cdc_options.min_chunk_size, 512 * 1024); + ASSERT_EQ(cdc_options.max_chunk_size, 
2048 * 1024); + ASSERT_EQ(cdc_options.norm_factor, 1); +} + TEST(TestReaderProperties, GetStreamInsufficientData) { // ARROW-6058 std::string data = "shorter than expected"; From 5a78e86fd60b0da5825e37f43e30dd3c87e131ce Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 12:08:06 +0200 Subject: [PATCH 094/102] Use anonymous namespace for CalculateMask --- cpp/src/parquet/chunker_internal.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index c59dba4d406..4f3df896b73 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -39,6 +39,8 @@ static_assert(std::size(kGearhashTable) == kNumGearhashTables, static_assert(sizeof(kGearhashTable) == kNumGearhashTables * 256 * 8, "each table should have 256 entries of 64 bit values"); +namespace { + /// Calculate the mask to use for the rolling hash, the mask is used to determine if a /// new chunk should be created based on the rolling hash value. The mask is calculated /// based on the min_chunk_size, max_chunk_size and norm_factor parameters. 
@@ -65,8 +67,7 @@ static_assert(sizeof(kGearhashTable) == kNumGearhashTables * 256 * 8, // @param max_chunk_size The maximum chunk size (default 1MiB) // @param norm_factor Normalization factor (default 0) // @return The mask used to compare against the rolling hash -static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, - int norm_factor) { +uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_factor) { if (min_chunk_size < 0) { throw ParquetException("min_chunk_size must be positive"); } @@ -102,6 +103,8 @@ static uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, } } +} // namespace + class ContentDefinedChunker::Impl { public: Impl(const LevelInfo& level_info, int64_t min_chunk_size, int64_t max_chunk_size, From 4721f0022afc06d8c985f1826e90b916ca34beb3 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 12:10:37 +0200 Subject: [PATCH 095/102] Correct error message for min_chunk_size=0 --- cpp/src/parquet/chunker_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 4f3df896b73..e57bb066b50 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -69,7 +69,7 @@ namespace { // @return The mask used to compare against the rolling hash uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_factor) { if (min_chunk_size < 0) { - throw ParquetException("min_chunk_size must be positive"); + throw ParquetException("min_chunk_size must be non-negative"); } if (max_chunk_size <= min_chunk_size) { throw ParquetException("max_chunk_size must be greater than min_chunk_size"); From 9b4522dce2c5d2cf9c496488483978dc12678fc2 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 12:18:45 +0200 Subject: [PATCH 096/102] Rename norm_factor to norm_level to better reflect that it is an integral value --- 
cpp/src/parquet/chunker_internal.cc | 24 +++++++++---------- cpp/src/parquet/chunker_internal.h | 8 +++---- cpp/src/parquet/chunker_internal_test.cc | 8 +++---- cpp/src/parquet/column_writer.cc | 2 +- cpp/src/parquet/properties.h | 8 +++---- cpp/src/parquet/properties_test.cc | 4 ++-- docs/source/python/parquet.rst | 4 ++-- python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 4 ++-- python/pyarrow/parquet/core.py | 8 +++---- .../tests/parquet/test_parquet_writer.py | 4 ++-- 11 files changed, 38 insertions(+), 38 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index e57bb066b50..15ffa2cac5b 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -43,7 +43,7 @@ namespace { /// Calculate the mask to use for the rolling hash, the mask is used to determine if a /// new chunk should be created based on the rolling hash value. The mask is calculated -/// based on the min_chunk_size, max_chunk_size and norm_factor parameters. +/// based on the min_chunk_size, max_chunk_size and norm_level parameters. 
/// /// Assuming that the gear hash hash random values with a uniform distribution, then each /// bit in the actual value of rolling_hash_ has even probability of being set so a mask @@ -65,9 +65,9 @@ namespace { // // @param min_chunk_size The minimum chunk size (default 256KiB) // @param max_chunk_size The maximum chunk size (default 1MiB) -// @param norm_factor Normalization factor (default 0) +// @param norm_level Normalization level (default 0) // @return The mask used to compare against the rolling hash -uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_factor) { +uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_level) { if (min_chunk_size < 0) { throw ParquetException("min_chunk_size must be non-negative"); } @@ -87,11 +87,11 @@ uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_ // by taking the floor(log2(target_size)) int mask_bits = std::max(0, ::arrow::bit_util::NumRequiredBits(target_size) - 1); - // a user defined `norm_factor` can be used to adjust the mask size, hence the matching - // probability, by increasing the norm_factor we increase the probability of matching - // the mask, forcing the distribution closer to the average size; norm_factor is 0 by + // a user defined `norm_level` can be used to adjust the mask size, hence the matching + // probability, by increasing the norm_level we increase the probability of matching + // the mask, forcing the distribution closer to the average size; norm_level is 0 by // default - int effective_bits = mask_bits - norm_factor; + int effective_bits = mask_bits - norm_level; if (effective_bits < 1 || effective_bits > 63) { throw ParquetException( @@ -108,11 +108,11 @@ uint64_t CalculateMask(int64_t min_chunk_size, int64_t max_chunk_size, int norm_ class ContentDefinedChunker::Impl { public: Impl(const LevelInfo& level_info, int64_t min_chunk_size, int64_t max_chunk_size, - int norm_factor) + int norm_level) : 
level_info_(level_info), min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), - rolling_hash_mask_(CalculateMask(min_chunk_size, max_chunk_size, norm_factor)) {} + rolling_hash_mask_(CalculateMask(min_chunk_size, max_chunk_size, norm_level)) {} uint64_t GetRollingHashMask() const { return rolling_hash_mask_; } @@ -375,7 +375,7 @@ class ContentDefinedChunker::Impl { const int64_t max_chunk_size_; // The mask to match the rolling hash against to determine if a new chunk should be // created. The mask is calculated based on min/max chunk size and the normalization - // factor. + // level. const uint64_t rolling_hash_mask_; // Whether the rolling hash has matched the mask since the last chunk creation. This @@ -393,8 +393,8 @@ class ContentDefinedChunker::Impl { ContentDefinedChunker::ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, - int64_t max_chunk_size, int norm_factor) - : impl_(new Impl(level_info, min_chunk_size, max_chunk_size, norm_factor)) {} + int64_t max_chunk_size, int norm_level) + : impl_(new Impl(level_info, min_chunk_size, max_chunk_size, norm_level)) {} ContentDefinedChunker::~ContentDefinedChunker() = default; diff --git a/cpp/src/parquet/chunker_internal.h b/cpp/src/parquet/chunker_internal.h index 187f39647bd..070b5f6c0b2 100644 --- a/cpp/src/parquet/chunker_internal.h +++ b/cpp/src/parquet/chunker_internal.h @@ -108,16 +108,16 @@ class PARQUET_EXPORT ContentDefinedChunker { // property that controls the maximum size of a parquet data page after encoding. /// While setting `data_pagesize` to a smaller value than `max_chunk_size` doesn't /// affect the chunking effectiveness, it results in more small parquet data pages. - /// @param norm_factor Normalization factor to center the chunk size around the average + /// @param norm_level Normalization level to center the chunk size around the average /// size more aggressively, default 0. 
- /// Increasing the normalization factor increases the probability of finding a chunk + /// Increasing the normalization level increases the probability of finding a chunk /// boundary, improving the deduplication ratio, but also increases the number of /// small chunks resulting in many small parquet data pages. The default value /// provides a good balance between deduplication ratio and fragmentation. - /// Use norm_factor=1 or norm_factor=2 to reach a higher deduplication ratio at the + /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the /// expense of fragmentation. ContentDefinedChunker(const LevelInfo& level_info, int64_t min_chunk_size, - int64_t max_chunk_size, int norm_factor = 0); + int64_t max_chunk_size, int norm_level = 0); ~ContentDefinedChunker(); /// Get the chunk boundaries for the given column data diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index d5e23c4fc81..6c888120213 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -339,7 +339,7 @@ Result> WriteTableToBuffer( auto builder = WriterProperties::Builder(); builder.enable_content_defined_chunking()->content_defined_chunking_options( - {min_chunk_size, max_chunk_size, /*norm_factor=*/0}); + {min_chunk_size, max_chunk_size, /*norm_level=*/0}); builder.data_page_version(data_page_version); if (enable_dictionary) { builder.enable_dictionary(); @@ -843,7 +843,7 @@ TEST_F(TestCDC, ChunkSizeParameterValidation) { ASSERT_NO_THROW(ContentDefinedChunker(li, 256 * 1024, 1024 * 1024)); - // with norm_factor=0 the difference between min and max chunk size must be + // with norm_level=0 the difference between min and max chunk size must be // at least 16 ASSERT_THROW(ContentDefinedChunker(li, 0, -1), ParquetException); ASSERT_THROW(ContentDefinedChunker(li, 1024, 512), ParquetException); @@ -861,7 +861,7 @@ TEST_F(TestCDC, ChunkSizeParameterValidation) { ASSERT_NO_THROW( 
ContentDefinedChunker(li, 1024 * 1024 * 1024L, 2LL * 1024 * 1024 * 1024L)); - // with norm_factor=1 the difference between min and max chunk size must be + // with norm_level=1 the difference between min and max chunk size must be // at least 64 ASSERT_THROW(ContentDefinedChunker(li, 1, -1, 1), ParquetException); ASSERT_THROW(ContentDefinedChunker(li, -1, 1, 1), ParquetException); @@ -870,7 +870,7 @@ TEST_F(TestCDC, ChunkSizeParameterValidation) { ASSERT_THROW(ContentDefinedChunker(li, 1, 33, 1), ParquetException); ASSERT_NO_THROW(ContentDefinedChunker(li, 1, 65, 1)); - // with norm_factor=2 the difference between min and max chunk size must be + // with norm_level=2 the difference between min and max chunk size must be // at least 128 ASSERT_THROW(ContentDefinedChunker(li, 0, 123, 2), ParquetException); ASSERT_NO_THROW(ContentDefinedChunker(li, 0, 128, 2)); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 4c02758f20a..088ec0a1852 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -769,7 +769,7 @@ class ColumnWriterImpl { auto cdc_options = properties_->content_defined_chunking_options(); content_defined_chunker_.emplace(level_info_, cdc_options.min_chunk_size, cdc_options.max_chunk_size, - cdc_options.norm_factor); + cdc_options.norm_level); } } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 02700f14375..748504308f4 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -261,14 +261,14 @@ struct PARQUET_EXPORT CdcOptions { int64_t max_chunk_size = 1024 * 1024; /// Number of bit adjustement to the gearhash mask in order to /// center the chunk size around the average size more aggressively, default 0 - /// Increasing the normalization factor increases the probability of finding a chunk, + /// Increasing the normalization level increases the probability of finding a chunk, /// improving the deduplication ratio, but also increasing the 
number of small chunks /// resulting in many small parquet data pages. The default value provides a good - /// balance between deduplication ratio and fragmentation. Use norm_factor=1 or - /// norm_factor=2 to reach a higher deduplication ratio at the expense of + /// balance between deduplication ratio and fragmentation. Use norm_level=1 or + /// norm_level=2 to reach a higher deduplication ratio at the expense of /// fragmentation. Negative values can also be used to reduce the probability of /// finding a chunk, resulting in larger chunks and fewer data pages. - int norm_factor = 0; + int norm_level = 0; }; static constexpr CdcOptions kDefaultCdcOptions = CdcOptions{}; diff --git a/cpp/src/parquet/properties_test.cc b/cpp/src/parquet/properties_test.cc index 82e48136877..bb2bc9552b4 100644 --- a/cpp/src/parquet/properties_test.cc +++ b/cpp/src/parquet/properties_test.cc @@ -119,7 +119,7 @@ TEST(TestWriterProperties, ContentDefinedChunkingSettings) { auto cdc_options = props->content_defined_chunking_options(); ASSERT_EQ(cdc_options.min_chunk_size, 256 * 1024); ASSERT_EQ(cdc_options.max_chunk_size, 1024 * 1024); - ASSERT_EQ(cdc_options.norm_factor, 0); + ASSERT_EQ(cdc_options.norm_level, 0); builder.enable_content_defined_chunking(); builder.content_defined_chunking_options(CdcOptions{512 * 1024, 2048 * 1024, 1}); @@ -128,7 +128,7 @@ TEST(TestWriterProperties, ContentDefinedChunkingSettings) { cdc_options = props->content_defined_chunking_options(); ASSERT_EQ(cdc_options.min_chunk_size, 512 * 1024); ASSERT_EQ(cdc_options.max_chunk_size, 2048 * 1024); - ASSERT_EQ(cdc_options.norm_factor, 1); + ASSERT_EQ(cdc_options.norm_level, 1); } TEST(TestReaderProperties, GetStreamInsufficientData) { diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 1d2664e81e2..6302f5dd2e8 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -806,12 +806,12 @@ the Parquet writer. 
It accepts either a boolean or a dictionary for configuratio - ``True``: Uses the default configuration with: - Minimum chunk size: 256 KiB - Maximum chunk size: 1024 KiB - - Normalization factor: 0 + - Normalization level: 0 - ``dict``: Allows customization of the chunking parameters: - ``min_chunk_size``: Minimum chunk size in bytes (default: 256 KiB). - ``max_chunk_size``: Maximum chunk size in bytes (default: 1024 KiB). - - ``norm_factor``: Normalization factor to adjust chunk size distribution (default: 0). + - ``norm_level``: Normalization level to adjust chunk size distribution (default: 0). Note that the chunk size is calculated on the logical values before applying any encoding or compression. The actual size of the data pages may vary based on the encoding and diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 3cfc5a7c14a..bf65b2b9d15 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -465,7 +465,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: cdef cppclass CdcOptions: int64_t min_chunk_size int64_t max_chunk_size - int norm_factor + int norm_level cdef cppclass WriterProperties: cppclass Builder: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index fa073d7e64d..5033006b508 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -2122,7 +2122,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( elif isinstance(use_content_defined_chunking, dict): defined_keys = use_content_defined_chunking.keys() mandatory_keys = {"min_chunk_size", "max_chunk_size"} - allowed_keys = {"min_chunk_size", "max_chunk_size", "norm_factor"} + allowed_keys = {"min_chunk_size", "max_chunk_size", "norm_level"} unknown_keys = defined_keys - allowed_keys missing_keys = mandatory_keys - defined_keys if unknown_keys: @@ -2133,7 +2133,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( f"Missing options in 'use_content_defined_chunking': 
{missing_keys}") cdc_options.min_chunk_size = use_content_defined_chunking["min_chunk_size"] cdc_options.max_chunk_size = use_content_defined_chunking["max_chunk_size"] - cdc_options.norm_factor = use_content_defined_chunking.get("norm_factor", 0) + cdc_options.norm_level = use_content_defined_chunking.get("norm_level", 0) props.enable_content_defined_chunking() props.content_defined_chunking_options(cdc_options) else: diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 3585e3b9640..b6a813fdbdd 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -917,13 +917,13 @@ def _sanitize_table(table, new_schema, flavor): the maximum size of a parquet data page after encoding. While setting `data_page_size` to a smaller value than `max_chunk_size` doesn't affect the chunking effectiveness, it results in more small parquet data pages. - - `norm_factor`: normalization factor to center the chunk size around the average + - `norm_level`: normalization level to center the chunk size around the average size more aggressively, default 0 - Increasing the normalization factor increases the probability of finding a chunk, + Increasing the normalization level increases the probability of finding a chunk, improving the deduplication ratio, but also increasing the number of small chunks resulting in many small parquet data pages. The default value provides a good - balance between deduplication ratio and fragmentation. Use norm_factor=1 or - norm_factor=2 to reach a higher deduplication ratio at the expense of + balance between deduplication ratio and fragmentation. Use norm_level=1 or + norm_level=2 to reach a higher deduplication ratio at the expense of fragmentation. 
""" diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 7fd30c73576..d1e9e874ba1 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -445,6 +445,6 @@ def test_parquet_content_defined_chunking_parameters(tempdir): cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536} pq.write_table(table, path, use_content_defined_chunking=cdc_options) - # using min_chunk_size, max_chunk_size and norm_factor - cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_factor": 1} + # using min_chunk_size, max_chunk_size and norm_level + cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_level": 1} pq.write_table(table, path, use_content_defined_chunking=cdc_options) From 8f56430fef3fff70273302c33d72a12a3e08fc92 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 12:55:59 +0200 Subject: [PATCH 097/102] Add note about norm_level recommended range --- cpp/src/parquet/properties.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 748504308f4..9faba909c83 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -247,7 +247,7 @@ class PARQUET_EXPORT ColumnProperties { // EXPERIMENTAL: Options for content-defined chunking. struct PARQUET_EXPORT CdcOptions { - /// Minimum chunk size in bytes, default 256 KiB + /// Minimum chunk size in bytes, default is 256 KiB /// The rolling hash will not be updated until this size is reached for each chunk. /// Note that all data sent through the hash function is counted towards the chunk /// size, including definition and repetition levels if present. 
@@ -259,15 +259,17 @@ struct PARQUET_EXPORT CdcOptions { /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the /// chunking effectiveness, it results in more small parquet data pages. int64_t max_chunk_size = 1024 * 1024; - /// Number of bit adjustement to the gearhash mask in order to - /// center the chunk size around the average size more aggressively, default 0 + /// Number of bit adjustment to the gearhash mask in order to center the chunk size + /// around the average size more aggressively, default is 0 /// Increasing the normalization level increases the probability of finding a chunk, /// improving the deduplication ratio, but also increasing the number of small chunks /// resulting in many small parquet data pages. The default value provides a good - /// balance between deduplication ratio and fragmentation. Use norm_level=1 or - /// norm_level=2 to reach a higher deduplication ratio at the expense of - /// fragmentation. Negative values can also be used to reduce the probability of - /// finding a chunk, resulting in larger chunks and fewer data pages. + /// balance between deduplication ratio and fragmentation. + /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the + /// expense of fragmentation. Negative values can also be used to reduce the + /// probability of finding a chunk, resulting in larger chunks and fewer data pages. + /// Note that values outside [-3, 3] are not recommended, prefer using the default + /// value of 0 for most use cases. 
int norm_level = 0; }; From 893465a7cc1c74745ecd47d62221807f30ddb59f Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Sat, 10 May 2025 20:28:33 +0200 Subject: [PATCH 098/102] Make content defined chunking branches unlikely --- cpp/src/parquet/column_writer.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 088ec0a1852..9e67de6d715 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1278,7 +1278,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, int64_t WriteBatch(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const T* values) override { - if (properties_->content_defined_chunking_enabled()) { + if (ARROW_PREDICT_FALSE(properties_->content_defined_chunking_enabled())) { throw ParquetException( "Content-defined chunking is not yet supported for WriteBatch() and " "WriteBatchSpaced(), use WriteArrow() instead"); @@ -1320,7 +1320,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) override { - if (properties_->content_defined_chunking_enabled()) { + if (ARROW_PREDICT_FALSE(properties_->content_defined_chunking_enabled())) { throw ParquetException( "Content-defined chunking is not yet supported for WriteBatch() and " "WriteBatchSpaced(), use WriteArrow() instead"); @@ -1388,7 +1388,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, bits_buffer_->ZeroPadding(); } - if (properties_->content_defined_chunking_enabled()) { + if (ARROW_PREDICT_FALSE(properties_->content_defined_chunking_enabled())) { DCHECK(content_defined_chunker_.has_value()); auto chunks = content_defined_chunker_->GetChunks(def_levels, rep_levels, num_levels, leaf_array); From 768743cfebe3a4a39b196d4913719e5a7b98d27b Mon Sep 17 00:00:00 2001 From: 
Krisztian Szucs Date: Mon, 12 May 2025 19:50:23 +0200 Subject: [PATCH 099/102] Address review comments --- cpp/src/parquet/chunker_internal.cc | 5 +++-- cpp/src/parquet/properties.h | 6 +++--- python/pyarrow/_parquet.pxd | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index 15ffa2cac5b..c0c74546c58 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -273,7 +273,7 @@ class ContentDefinedChunker::Impl { RollValue(value_offset); } - if ((rep_level == 0) && NeedNewChunk()) { + if (rep_level == 0 && NeedNewChunk()) { // if we are at a record boundary and need a new chunk, we create a new chunk auto levels_to_write = offset - prev_offset; if (levels_to_write > 0) { @@ -305,7 +305,8 @@ class ContentDefinedChunker::Impl { const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& values) { const uint8_t* raw_values = - values.data()->GetValues(1, 0) + values.offset() * kByteWidth; + values.data()->GetValues(/*i=*/1, /*absolute_offset=*/0) + + values.offset() * kByteWidth; return Calculate(def_levels, rep_levels, num_levels, [&](int64_t i) { return Roll(&raw_values[i * kByteWidth]); }); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 9faba909c83..24d900fdbb5 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -273,8 +273,6 @@ struct PARQUET_EXPORT CdcOptions { int norm_level = 0; }; -static constexpr CdcOptions kDefaultCdcOptions = CdcOptions{}; - class PARQUET_EXPORT WriterProperties { public: class Builder { @@ -292,7 +290,7 @@ class PARQUET_EXPORT WriterProperties { page_checksum_enabled_(false), size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), content_defined_chunking_enabled_(false), - content_defined_chunking_options_(kDefaultCdcOptions) {} + content_defined_chunking_options_({}) {} explicit Builder(const WriterProperties& properties) : 
pool_(properties.memory_pool()), @@ -322,6 +320,8 @@ class PARQUET_EXPORT WriterProperties { /// efficient deduplication of data across files, hence more efficient network /// transfers and storage. The chunking is based on a rolling hash algorithm that /// identifies chunk boundaries based on the actual content of the data. + /// + /// Note that only the WriteArrow() interface is supported at the moment. Builder* enable_content_defined_chunking() { content_defined_chunking_enabled_ = true; return this; diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index bf65b2b9d15..7095b35a2c1 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -502,7 +502,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_page_checksum() Builder* enable_content_defined_chunking() Builder* disable_content_defined_chunking() - Builder* content_defined_chunking_options(const CdcOptions options) + Builder* content_defined_chunking_options(CdcOptions options) shared_ptr[WriterProperties] build() cdef cppclass ArrowWriterProperties: From cb5e16c09b1e35316780f9b3c22821552d854d73 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 12 May 2025 20:36:27 +0200 Subject: [PATCH 100/102] Assert on exception message for WriteBatchSpaced and WriteBatch if CDC is enabled --- cpp/src/parquet/chunker_internal_test.cc | 23 +++++++++++++++++------ cpp/src/parquet/column_writer.cc | 8 ++++---- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 6c888120213..2343c63ba93 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -22,6 +22,7 @@ #include #include +#include #include #include "arrow/table.h" @@ -934,12 +935,22 @@ TEST_F(TestCDC, WriteSingleColumnParquetFile) { std::vector numbers = {1, 2, 3, 4, 5}; std::vector valid_bits = {1, 0, 1, 0, 1}; - EXPECT_THROW( - 
int_column_writer.WriteBatch(numbers.size(), nullptr, nullptr, numbers.data()), - ParquetException); - EXPECT_THROW(int_column_writer.WriteBatchSpaced(numbers.size(), nullptr, nullptr, - valid_bits.data(), 0, numbers.data()), - ParquetException); + + auto expected_msg = ::testing::Property( + &ParquetException::what, + ::testing::HasSubstr("Content-defined chunking is not supported in WriteBatch() or " + "WriteBatchSpaced(), use WriteArrow() instead.")); + EXPECT_THROW_THAT( + [&]() { + int_column_writer.WriteBatch(numbers.size(), nullptr, nullptr, numbers.data()); + }, + ParquetException, expected_msg); + EXPECT_THROW_THAT( + [&]() { + int_column_writer.WriteBatchSpaced(numbers.size(), nullptr, nullptr, + valid_bits.data(), 0, numbers.data()); + }, + ParquetException, expected_msg); } TEST_F(TestCDC, LastChunkDoesntTriggerAddDataPage) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 9e67de6d715..6f169a1476e 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1280,8 +1280,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, const int16_t* rep_levels, const T* values) override { if (ARROW_PREDICT_FALSE(properties_->content_defined_chunking_enabled())) { throw ParquetException( - "Content-defined chunking is not yet supported for WriteBatch() and " - "WriteBatchSpaced(), use WriteArrow() instead"); + "Content-defined chunking is not supported in WriteBatch() or " + "WriteBatchSpaced(), use WriteArrow() instead."); } return WriteBatchInternal(num_values, def_levels, rep_levels, values); } @@ -1322,8 +1322,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, int64_t valid_bits_offset, const T* values) override { if (ARROW_PREDICT_FALSE(properties_->content_defined_chunking_enabled())) { throw ParquetException( - "Content-defined chunking is not yet supported for WriteBatch() and " - "WriteBatchSpaced(), use WriteArrow() instead"); + "Content-defined chunking is not supported in 
WriteBatch() or " + "WriteBatchSpaced(), use WriteArrow() instead."); } return WriteBatchSpacedInternal(num_values, def_levels, rep_levels, valid_bits, valid_bits_offset, values); From ab3f86ef78915ed23c927d262daa781ff80d1b9a Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Mon, 12 May 2025 20:54:36 +0200 Subject: [PATCH 101/102] Test JSON extension type instead of UUID because UUID is not an extension type supported by the parquet reader --- cpp/src/parquet/chunker_internal_test.cc | 25 ++++++++---------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index 2343c63ba93..ab014126ac0 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -25,8 +25,8 @@ #include #include +#include "arrow/extension/json.h" #include "arrow/table.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/type_fwd.h" #include "arrow/util/float16.h" @@ -323,6 +323,7 @@ Result> ReadTableFromBuffer(const std::shared_ptr FileReaderBuilder builder; std::unique_ptr reader; auto props = default_arrow_reader_properties(); + props.set_arrow_extensions_enabled(true); RETURN_NOT_OK(builder.Open(std::make_shared(data))); RETURN_NOT_OK(builder.memory_pool(::arrow::default_memory_pool()) @@ -353,21 +354,11 @@ Result> WriteTableToBuffer( write_props, arrow_props)); ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish()); - // check whether the schema has extension types, if not we can easily ensure that - // the parquet seralization is roundtripable with CDC enabled - bool validate_roundtrip = true; - for (const auto& field : table->schema()->fields()) { - if (field->type()->id() == ::arrow::Type::EXTENSION) { - validate_roundtrip = false; - break; - } - } - if (validate_roundtrip) { - ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); - RETURN_NOT_OK(readback->ValidateFull()); - 
ARROW_RETURN_IF(!readback->Equals(*table), - Status::Invalid("Readback table not equal to original")); - } + // validate that the data correctly roundtrips + ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); + RETURN_NOT_OK(readback->ValidateFull()); + ARROW_RETURN_IF(!readback->Equals(*table), + Status::Invalid("Readback table not equal to original")); return buffer; } @@ -1487,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P( ::arrow::list(::arrow::struct_({::arrow::field("f0", ::arrow::int32())})), false}, // Extension type - CaseConfig{::arrow::uuid(), true}, + CaseConfig{::arrow::extension::json(), true}, // Use ParquetDataPageVersion::V2 CaseConfig{::arrow::large_binary(), false, ParquetDataPageVersion::V2}, CaseConfig{::arrow::list(::arrow::utf8()), true, ParquetDataPageVersion::V2})); From 1cc2e4b690e467f548dc4131f9b392a864a132aa Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Tue, 13 May 2025 16:45:08 +0200 Subject: [PATCH 102/102] Reduce the number of test cases for ASAN/Valgrind builds and add more docstrings --- cpp/src/parquet/chunker_internal.cc | 16 ++++++++++++++++ cpp/src/parquet/chunker_internal_test.cc | 19 +++++++++++++++++-- cpp/src/parquet/properties.h | 10 ++++++++++ docs/source/python/parquet.rst | 3 ++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/chunker_internal.cc b/cpp/src/parquet/chunker_internal.cc index c0c74546c58..cc0a386f4c1 100644 --- a/cpp/src/parquet/chunker_internal.cc +++ b/cpp/src/parquet/chunker_internal.cc @@ -117,6 +117,8 @@ class ContentDefinedChunker::Impl { uint64_t GetRollingHashMask() const { return rolling_hash_mask_; } void Roll(bool value) { + // Update the rolling hash with a boolean value, set has_matched_ to true if the hash + // matches the mask. if (++chunk_size_ < min_chunk_size_) { // short-circuit if we haven't reached the minimum chunk size, this speeds up the // chunking process since the gearhash doesn't need to be updated @@ -222,6 +224,20 @@ class 
ContentDefinedChunker::Impl { std::vector Calculate(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const RollFunc& RollValue) { // Calculate the chunk boundaries for typed Arrow arrays. + // + // The chunking state is maintained across the entire column without being reset + // between pages and row groups. This enables the chunking process to be + // continued between different WriteArrow calls. + // + // Below we go over the (def_level, rep_level, value) triplets one by one while + // adjusting the column-global rolling hash based on the triplet. Whenever the + // rolling hash matches a predefined mask it sets the `has_matched_` flag to true. + // + // After each triplet NeedNewChunk() is called to evaluate if we need to create + // a new chunk. If the rolling hash matches the mask `kNumGearhashTables` times in a + // row (required for better chunk size distribution) and satisfies the chunk size + // requirements, we create a new chunk. See the `NeedNewChunk()` method for more + // details. 
std::vector chunks; int64_t offset; int64_t prev_offset = 0; diff --git a/cpp/src/parquet/chunker_internal_test.cc b/cpp/src/parquet/chunker_internal_test.cc index ab014126ac0..1b028cb1d69 100644 --- a/cpp/src/parquet/chunker_internal_test.cc +++ b/cpp/src/parquet/chunker_internal_test.cc @@ -1445,8 +1445,22 @@ TEST_P(TestCDCSingleRowGroup, ArrayOffsets) { } } +#if defined(ADDRESS_SANITIZER) || defined(ARROW_VALGRIND) +// Instantiate the test suite with a reduced set of types to avoid slow tests INSTANTIATE_TEST_SUITE_P( - FixedSizedTypes, TestCDCSingleRowGroup, + Types, TestCDCSingleRowGroup, + testing::Values( + CaseConfig{::arrow::boolean(), false}, CaseConfig{::arrow::int64(), true}, + // Binary-like + CaseConfig{::arrow::utf8(), false}, + CaseConfig{::arrow::fixed_size_binary(16), true}, + // Nested types + CaseConfig{::arrow::list(::arrow::int32()), false}, + CaseConfig{::arrow::list(::arrow::utf8()), true}, + CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::float64())}), true})); +#else +INSTANTIATE_TEST_SUITE_P( + Types, TestCDCSingleRowGroup, testing::Values( // Boolean CaseConfig{::arrow::boolean(), false}, @@ -1469,7 +1483,7 @@ INSTANTIATE_TEST_SUITE_P( CaseConfig{::arrow::timestamp(::arrow::TimeUnit::NANO), true}, CaseConfig{::arrow::duration(::arrow::TimeUnit::NANO), false}, // Nested types - CaseConfig{::arrow::list(::arrow::int32()), false}, + CaseConfig{::arrow::list(::arrow::int16()), false}, CaseConfig{::arrow::list(::arrow::int32()), true}, CaseConfig{::arrow::list(::arrow::utf8()), true}, CaseConfig{::arrow::struct_({::arrow::field("f0", ::arrow::int32())}), false}, @@ -1482,6 +1496,7 @@ INSTANTIATE_TEST_SUITE_P( // Use ParquetDataPageVersion::V2 CaseConfig{::arrow::large_binary(), false, ParquetDataPageVersion::V2}, CaseConfig{::arrow::list(::arrow::utf8()), true, ParquetDataPageVersion::V2})); +#endif class TestCDCMultipleRowGroups : public ::testing::Test { protected: diff --git a/cpp/src/parquet/properties.h 
b/cpp/src/parquet/properties.h index 24d900fdbb5..1acbc188e79 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -246,6 +246,16 @@ class PARQUET_EXPORT ColumnProperties { }; // EXPERIMENTAL: Options for content-defined chunking. +/// +/// Content-defined chunking is an experimental feature that optimizes parquet +/// files for content addressable storage (CAS) systems by writing data pages +/// according to content-defined chunk boundaries. This allows for more +/// efficient deduplication of data across files, hence more efficient network +/// transfers and storage. +/// Each content-defined chunk is written as a separate parquet data page. The +/// following options control the chunks' size and the chunking process. Note +/// that the chunk size is calculated based on the logical value of the data, +/// before any encoding or compression is applied. struct PARQUET_EXPORT CdcOptions { /// Minimum chunk size in bytes, default is 256 KiB /// The rolling hash will not be updated until this size is reached for each chunk. diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 6302f5dd2e8..78e84e3ce8d 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -818,7 +818,8 @@ or compression. The actual size of the data pages may vary based on the encoding compression used. .. note:: - Ensure that Parquet write options remain consistent across writes and files. + To make the most of this feature, you should ensure that Parquet write options + remain consistent across writes and files. Using different write options (like compression, encoding, or row group size) for different files may prevent proper deduplication and lead to suboptimal storage efficiency.