From 0a7e537e808a767d9e08b5ef27fb99d2a002dcca Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 17 Apr 2020 08:24:36 +0000 Subject: [PATCH 01/40] compiling --- cpp/cmake_modules/SetupCxxFlags.cmake | 7 +- cpp/src/arrow/util/bit_util.cc | 10 +- cpp/src/arrow/util/bit_util.h | 8 ++ cpp/src/parquet/column_reader.cc | 165 +++++++++++++++++++++-- cpp/src/parquet/column_reader.h | 47 +------ cpp/src/parquet/level_conversion.h | 182 ++++++++++++++++++++++++++ 6 files changed, 355 insertions(+), 64 deletions(-) create mode 100644 cpp/src/parquet/level_conversion.h diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 726de0adec2..753bb303874 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -42,12 +42,13 @@ if(ARROW_CPU_FLAG STREQUAL "x86") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") - set(ARROW_AVX2_FLAG "-mavx2") + set(ARROW_AVX2_FLAG "-march=core-avx2") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ARROW_AVX512_FLAG "-march=skylake-avx512") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) + check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) check_cxx_compiler_flag(${ARROW_AVX512_FLAG} CXX_SUPPORTS_AVX512) elseif(ARROW_CPU_FLAG STREQUAL "ppc") # power compiler flags, gcc/clang only @@ -313,13 +314,13 @@ if(ARROW_CPU_FLAG STREQUAL "x86") message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX512_FLAG}") - add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2) + add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_BMI2 -DARROW_HAVE_SSE4_2) elseif(ARROW_SIMD_LEVEL STREQUAL "AVX2") if(NOT CXX_SUPPORTS_AVX2) message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX2_FLAG}") - add_definitions(-DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2) + add_definitions(-DARROW_HAVE_AVX2 -DARROW_HAVE_BMI2 -DARROW_HAVE_SSE4_2) elseif(ARROW_SIMD_LEVEL STREQUAL "SSE4_2") if(NOT CXX_SUPPORTS_SSE4_2) message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.") diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index 5ec278acae8..df4b170c303 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -15,14 +15,6 @@ // specific language governing permissions and limitations // under the License. -// Alias MSVC popcount to GCC name -#ifdef _MSC_VER -#include -#define __builtin_popcount __popcnt -#include -#define __builtin_popcountll _mm_popcnt_u64 -#endif - #include #include #include @@ -131,7 +123,7 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { const uint64_t* end = u64_data + p.aligned_words; for (auto iter = u64_data; iter < end; ++iter) { - count += __builtin_popcountll(*iter); + count += BitUtil::PopCount(*iter); } } diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 12b1a13c59a..0223fe57eb1 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -43,13 +43,18 @@ #if defined(_MSC_VER) #include // IWYU pragma: keep +#include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong +#define ARROW_POPCOUNT64 __popcnt_u64 +#define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 #define ARROW_BYTE_SWAP32 __builtin_bswap32 +#define ARROW_POPCOUNT64 __builtin_popcountll +#define ARROW_POPCOUNT32 __builtin_popcount #endif #include @@ -107,6 +112,9 @@ static constexpr uint8_t kBytePopcount[] = { 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +static inline int PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); } +static inline int PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); } + // // Bit-related computations on integer values // diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 034defdcbe2..0c46aa37610 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -42,6 +42,7 @@ #include "parquet/encoding.h" #include "parquet/encryption_internal.h" #include "parquet/internal_file_decryptor.h" +#include "parquet/level_conversion.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift_internal.h" // IWYU pragma: keep @@ -51,6 +52,140 @@ using arrow::internal::checked_cast; namespace parquet { +namespace { + +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, + const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { + min_level = std::min(levels[x], min_level); + max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { + throw ParquetException("definition level exceeds maximum"); + } +} + +#if not defined(ARROW_HAVE_BMI2) +inline void DefinitionLevelsToBitmapScalar( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, + num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. + // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { + if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); + } else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + continue; + } + } else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + throw ParquetException("definition level exceeds maximum"); + } + } + + valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + int64_t valid_count = 0; + *values_read = 0; + while (num_def_levels > 0) { + int64_t batch_size = std::min(num_def_levels, kBitMaskSize); + CheckLevelRange(def_levels, batch_size, required_definition_level); + uint64_t valid_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); + if (has_repeated_parent) { + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. >= previous definition level -> > previous + // definition - 1). If there where structs mixed in, we need to know the def_level + // of the repeated parent so we can check for def_level > "def level of repeated + // parent". + uint64_t present_bits = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 2); + *values_read += internal::AppendValidityBitmap( + /*reversed_bitmap=*/valid_bitmap, + /*reversed_parent_bitmap=*/present_bits, + /*num_entries=*/batch_size, valid_bits, &valid_bits_offset, &valid_count); + } else { + *values_read += internal::AppendValidityBitmap( + /*reversed_bitmap=*/valid_bitmap, + /*reversed_parent_bitmap=*/-1, // this is unused + /*num_entries=*/batch_size, valid_bits, &valid_bits_offset, &valid_count); + } + def_levels += batch_size; + num_def_levels -= batch_size; + } + *null_count += *values_read - valid_count; +} + +inline void DefinitionLevelsToBitmapDispatch( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN +// we use AVx2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean +// way o detect that latter for MSVC. +#if defined(ARROW_HAVE_BMI2) + // We need BIM2 instruction which is AVX2 should imply. + DefinitionLevelsToBitmapSimd( + def_levels, num_def_levels, max_definition_level, values_read, null_count, + valid_bits, valid_bits_offset); +#else + DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); +#endif // ARROW_HAVE_BMI2 + + } else { + // No Special intsturction are used for non-repeated case. + DefinitionLevelsToBitmapSimd( + def_levels, num_def_levels, max_definition_level, values_read, null_count, + valid_bits, valid_bits_offset); + } +} + +#else // big-endian + // Optimized SIMD uses bit shifts that are unlikely to work on big endian platforms. + DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, + max_repitition_level, values_read, null_count, + valid_bits, valid_bits_offset); + +#endif + +} // namespace + LevelDecoder::LevelDecoder() : num_values_remaining_(0) {} LevelDecoder::~LevelDecoder() {} @@ -882,14 +1017,14 @@ int64_t TypedColumnReaderImpl::ReadBatchSpaced( } } total_values = this->ReadValues(values_to_read, values); - for (int64_t i = 0; i < total_values; i++) { - ::arrow::BitUtil::SetBit(valid_bits, valid_bits_offset + i); - } + ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset, + /*length=*/total_values, + /*bits_are_set=*/true); *values_read = total_values; } else { - internal::DefinitionLevelsToBitmap(def_levels, num_def_levels, this->max_def_level_, - this->max_rep_level_, values_read, &null_count, - valid_bits, valid_bits_offset); + DefinitionLevelsToBitmapDispatch(def_levels, num_def_levels, this->max_def_level_, + this->max_rep_level_, values_read, &null_count, + valid_bits, valid_bits_offset); total_values = this->ReadValuesSpaced(*values_read, values, static_cast(null_count), valid_bits, valid_bits_offset); @@ -900,9 +1035,9 @@ int64_t TypedColumnReaderImpl::ReadBatchSpaced( } else { // Required field, read all values total_values = this->ReadValues(batch_size, values); - for (int64_t i = 0; i < total_values; i++) { - ::arrow::BitUtil::SetBit(valid_bits, valid_bits_offset + i); - } + ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset, + /*length=*/total_values, + /*bits_are_set=*/true); *null_count_out = 0; *levels_read = total_values; } @@ -992,6 +1127,16 @@ namespace internal { // better vectorized performance when doing many smaller record reads constexpr int64_t kMinLevelBatchSize = 1024; +void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, + const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, + int64_t* null_count, uint8_t* valid_bits, + int64_t valid_bits_offset) { + DefinitionLevelsToBitmapDispatch(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); +} + template class TypedRecordReader : public ColumnReaderImplBase, virtual public RecordReader { @@ -1323,7 +1468,7 @@ class TypedRecordReader : public ColumnReaderImplBase, int64_t null_count = 0; if (nullable_values_) { int64_t values_with_nulls = 0; - internal::DefinitionLevelsToBitmap( + DefinitionLevelsToBitmapDispatch( def_levels() + start_levels_position, levels_position_ - start_levels_position, this->max_def_level_, this->max_rep_level_, &values_with_nulls, &null_count, valid_bits_->mutable_data(), values_written_); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 4b33e65a062..a1957bbb0a2 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -314,48 +314,11 @@ class DictionaryRecordReader : virtual public RecordReader { virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; }; -static inline void DefinitionLevelsToBitmap( - const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset) { - // We assume here that valid_bits is large enough to accommodate the - // additional definition levels and the ones that have already been written - ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, - num_def_levels); - - // TODO(itaiin): As an interim solution we are splitting the code path here - // between repeated+flat column reads, and non-repeated+nested reads. - // Those paths need to be merged in the future - for (int i = 0; i < num_def_levels; ++i) { - if (def_levels[i] == max_definition_level) { - valid_bits_writer.Set(); - } else if (max_repetition_level > 0) { - // repetition+flat case - if (def_levels[i] == (max_definition_level - 1)) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - continue; - } - } else { - // non-repeated+nested case - if (def_levels[i] < max_definition_level) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - throw ParquetException("definition level exceeds maximum"); - } - } - - valid_bits_writer.Next(); - } - valid_bits_writer.Finish(); - *values_read = valid_bits_writer.position(); -} - -} // namespace internal - -namespace internal { +void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, + const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, + int64_t* null_count, uint8_t* valid_bits, + int64_t valid_bits_offset); // TODO(itaiin): another code path split to merge when the general case is done static inline bool HasSpacedValues(const ColumnDescriptor* descr) { diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h new file mode 100644 index 00000000000..df43e64efc4 --- /dev/null +++ b/cpp/src/parquet/level_conversion.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include "arrow/util/bit_util.h" + +#if defined(ARROW_HAVE_BMI2) +#include "x86intrin.h" +#endif + +namespace parquet { +namespace internal { +// These APIs are likely to be revised as part of ARROW-8494 to reduce duplicate code. +// They currently represent minimal functionality for vectorized computation of definition +// levels. + +/// Builds a bitmap by applying predicate to the level vector provided. +/// +/// \param[in] levels Rep or def level array. +/// \param[in] num_levels The number of levels to process (must be [0, 64]) +/// \param[in] predicate The predicate to apply (must have the signature `bool +/// predicate(int16_t)`. +/// \returns The bitmap using least significant "bit" ordering. +/// +/// N.B. Correct byte ordering is dependent on little-endian architectures. +/// +template +uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate predicate) { + // Both clang and GCC can vectorize this automatically with AVX2. + uint64_t mask = 0; + for (int x = 0; x < num_levels; x++) { + mask |= static_cast(predicate(levels[x]) ? 1 : 0) << x; + } + return mask; +} + +/// Builds a bitmap where each set bit indicates the correspond level is greater +/// than rhs. +static inline int64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, + int16_t rhs) { + return LevelsToBitmap(levels, num_levels, [&](int16_t value) { return value > rhs; }); +} + +/// Append bits [0, number_of_bits) from new_Bits to valid_bits and valid_bits_offset. +/// +/// \param[in,out] valid_bits The valid bit bitmap to append to. +/// \param[in] valid_bits_offset The bit-offset at which to start appending new bits. +/// \param[in] valid_bits_length The number of bytes allocated in valid_bits. +/// \param[in] new_bits The zero-padded bitmap to append. +/// \param[in] number_of_bits The number of bits to append from new_bits. +/// \returns The new bit offset inside of valid_bits. +static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offset, + int64_t valid_bits_length, uint64_t new_bits, + int number_of_bits) { + // Selection masks to retrieve all low order bits for each bytes. + constexpr uint64_t kLsbSelectionMasks[] = { + 0, // unused. + 0x0101010101010101, + 0x0303030303030303, + 0x0707070707070707, + 0x0F0F0F0F0F0F0F0F, + 0x1F1F1F1F1F1F1F1F, + 0x3F3F3F3F3F3F3F3F, + 0x7F7F7F7F7F7F7F7F, + }; + int64_t valid_byte_offset = valid_bits_offset / 8; + int64_t bit_offset = valid_bits_offset % 8; + + int64_t new_offset = valid_bits_offset + number_of_bits; + union ByteAddressableBitmap { + explicit ByteAddressableBitmap(uint64_t mask) : mask(mask) {} + uint64_t mask; + uint8_t bytes[8]; + }; + + if (bit_offset != 0) { + int bits_to_carry = 8 - bit_offset; + // Get the mask the will select the lower order bits (the ones to carry + // over to the existing byte and shift up. + const ByteAddressableBitmap carry_bits(kLsbSelectionMasks[bits_to_carry]); + // Mask to select non-carried bits. + const uint64_t inverse_selection_mask = ~carry_bits.mask; + // Fill out the last incomplete byte in the output, by extracting the least + // siginficant bits from the first byte. + const ByteAddressableBitmap new_bitmap(new_bits); + valid_bits[valid_byte_offset] = + valid_bits[valid_byte_offset] | + (((new_bitmap.bytes[0] & carry_bits.bytes[0])) << bit_offset); + + // We illustrate logic with a 3-byte example in little endian/LSB order. + // Note this ordering is the reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left (division). + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + // Shifted mask should look like this assuming bit offset = 6: + // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N + // clang-format on + uint64_t shifted_new_bits = (new_bits & inverse_selection_mask) >> bits_to_carry; + // captured_carry: + // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N + uint64_t captured_carry = carry_bits.mask & new_bits; + // mask_cary_bits: + // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N + uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; + + new_bits = shifted_new_bits | mask_carry_bits; + // Don't overwrite the first byte + valid_byte_offset += 1; + number_of_bits -= bits_to_carry; + } + + int bytes_for_new_bits = ::arrow::BitUtil::BytesForBits(number_of_bits); + if (valid_bits_length - + (bytes_for_new_bits + ::arrow::BitUtil::BytesForBits(valid_bits_offset)) > + static_cast(sizeof(new_bits))) { + // This should be the common case and inlined as a single instruction which + // should be cheaper then the general case of calling mempcy, so it is likely + // worth the extra branch. + std::memcpy(valid_bits + valid_byte_offset, &new_bits, sizeof(new_bits)); + } else { + std::memcpy(valid_bits + valid_byte_offset, &new_bits, bytes_for_new_bits); + } + return new_offset; +} + +/// \brief Appends bit values to the validitdy bimap_valid bits, based on bitmaps +/// generated by GreaterThanBitmap, and the appropriate treshold definition_leve. +/// +/// \param[in] bitmap Bitmap generated by GreaterThanBitmap to ensure it exceeds the def +/// level to be non-null. \param[in] repeated_parent_bitmap Bitmap generated by +/// GreaterThanBitmap to find values that exceeds the def level to be considered "present" +/// (i.e. def level greater than the last repeated parents def_level). \param[in] +/// num_entries The number of bits to process in bitmap \[0, 64\]. This is ignored if +/// has_repeated_parent is false. +/// \param[in,out] valid_bits The validity bitmap to update. +/// \param[in,out] valid_bits_offset The current offset to start appending bits to in +/// valid_bits (updated to +/// new value after appending bits). +/// \param[out] set_bit_count The number of set (valid) bits added to valid_bits. +/// \returns The nunmber of values add to the bitmap (set + unset). +template +int64_t AppendValidityBitmap(uint64_t bitmap, uint64_t repeated_parent_bitmap, + int64_t num_entries, uint8_t* valid_bits, + int64_t* valid_bits_offset, int64_t* set_bit_count) { + int64_t values_read = num_entries; + int64_t min_valid_bits_size = + ::arrow::BitUtil::BytesForBits(num_entries + *valid_bits_offset); + if (has_repeated_parent) { +#if defined(ARROW_HAVE_BMI2) + // If the parent list was empty at for the given slot it should not be added to the + // bitmap. + bitmap = _pext_u64(bitmap, repeated_parent_bitmap); + values_read = ::arrow::BitUtil::PopCount(repeated_parent_bitmap); +#else + // We shouldn't get here. + std::abort(); +#endif + } + + *set_bit_count += ::arrow::BitUtil::PopCount(bitmap); + *valid_bits_offset = AppendBitmap(valid_bits, *valid_bits_offset, min_valid_bits_size, + bitmap, values_read); + return values_read; +} + +} // namespace internal +} // namespace parquet From 83c38c4e7e1b805194e7cd7980425c570c177c6d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Apr 2020 03:24:56 +0000 Subject: [PATCH 02/40] remove grouping --- cpp/src/arrow/util/bit_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index df4b170c303..e5dd1c18750 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -506,7 +506,7 @@ Result> BitmapOp(MemoryPool* pool, const uint8_t* left, std::string Bitmap::ToString() const { std::string out(length_ + ((length_ - 1) / 8), ' '); for (int64_t i = 0; i < length_; ++i) { - out[i + (i / 8)] = GetBit(i) ? '1' : '0'; + out[i] = GetBit(i) ? '1' : '0'; } return out; } From 3480711bf84bd59f0de98b21bba330b5c7588a43 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Apr 2020 04:07:36 +0000 Subject: [PATCH 03/40] fix msvc --- cpp/src/arrow/util/bit_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 0223fe57eb1..6e326c37b35 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -48,7 +48,7 @@ #pragma intrinsic(_BitScanForward) #define ARROW_BYTE_SWAP64 _byteswap_uint64 #define ARROW_BYTE_SWAP32 _byteswap_ulong -#define ARROW_POPCOUNT64 __popcnt_u64 +#define ARROW_POPCOUNT64 __popcnt64 #define ARROW_POPCOUNT32 __popcnt #else #define ARROW_BYTE_SWAP64 __builtin_bswap64 From 677c81f8b9b4712276404058906a11b9952700c3 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Apr 2020 04:34:14 +0000 Subject: [PATCH 04/40] more msvc fixes --- cpp/src/arrow/util/bit_util.h | 4 ++-- cpp/src/parquet/level_conversion.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 6e326c37b35..9ec4c4642eb 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -112,8 +112,8 @@ static constexpr uint8_t kBytePopcount[] = { 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; -static inline int PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); } -static inline int PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); } +static inline uint64_t PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); } +static inline uint32_t PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); } // // Bit-related computations on integer values diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index df43e64efc4..17c99f323d8 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -165,7 +165,8 @@ int64_t AppendValidityBitmap(uint64_t bitmap, uint64_t repeated_parent_bitmap, // If the parent list was empty at for the given slot it should not be added to the // bitmap. bitmap = _pext_u64(bitmap, repeated_parent_bitmap); - values_read = ::arrow::BitUtil::PopCount(repeated_parent_bitmap); + values_read = + static_cast(::arrow::BitUtil::PopCount(repeated_parent_bitmap)); #else // We shouldn't get here. std::abort(); From a8e7637a4dc123fa22f9ccde8d7933ad28f3394f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Apr 2020 04:46:54 +0000 Subject: [PATCH 05/40] move fixes --- cpp/src/parquet/level_conversion.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 17c99f323d8..a9d7828806e 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -90,7 +90,7 @@ static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offse }; if (bit_offset != 0) { - int bits_to_carry = 8 - bit_offset; + int64_t bits_to_carry = 8 - bit_offset; // Get the mask the will select the lower order bits (the ones to carry // over to the existing byte and shift up. const ByteAddressableBitmap carry_bits(kLsbSelectionMasks[bits_to_carry]); @@ -124,9 +124,8 @@ static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offse number_of_bits -= bits_to_carry; } - int bytes_for_new_bits = ::arrow::BitUtil::BytesForBits(number_of_bits); - if (valid_bits_length - - (bytes_for_new_bits + ::arrow::BitUtil::BytesForBits(valid_bits_offset)) > + int64_t bytes_for_new_bits = ::arrow::BitUtil::BytesForBits(number_of_bits); + if (valid_bits_length - ::arrow::BitUtil::BytesForBits(valid_bits_offset) > static_cast(sizeof(new_bits))) { // This should be the common case and inlined as a single instruction which // should be cheaper then the general case of calling mempcy, so it is likely From 0d2ebd20e5879da9d28f97a66318df7341cbdb18 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 20 Apr 2020 05:12:28 +0000 Subject: [PATCH 06/40] one more fix --- cpp/src/parquet/level_conversion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index a9d7828806e..4e522b26f50 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -67,7 +67,7 @@ static inline int64_t GreaterThanBitmap(const int16_t* levels, int64_t num_level /// \returns The new bit offset inside of valid_bits. static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offset, int64_t valid_bits_length, uint64_t new_bits, - int number_of_bits) { + int64_t number_of_bits) { // Selection masks to retrieve all low order bits for each bytes. constexpr uint64_t kLsbSelectionMasks[] = { 0, // unused. From e69d87aa1438a54f7572be6cb352f863a6cb69e3 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 21 Apr 2020 05:40:21 +0000 Subject: [PATCH 07/40] change not to ! --- cpp/src/parquet/column_reader.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 0c46aa37610..7318f0d13e3 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -67,7 +67,8 @@ inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, } } -#if not defined(ARROW_HAVE_BMI2) +#if !defined(ARROW_HAVE_BMI2) + inline void DefinitionLevelsToBitmapScalar( const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, From ee20f7a2672e3acef617f93008fe506a3fcb592d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 21 Apr 2020 06:56:14 +0000 Subject: [PATCH 08/40] put back missing value --- cpp/src/arrow/util/bit_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index e5dd1c18750..df4b170c303 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -506,7 +506,7 @@ Result> BitmapOp(MemoryPool* pool, const uint8_t* left, std::string Bitmap::ToString() const { std::string out(length_ + ((length_ - 1) / 8), ' '); for (int64_t i = 0; i < length_; ++i) { - out[i] = GetBit(i) ? '1' : '0'; + out[i + (i / 8)] = GetBit(i) ? '1' : '0'; } return out; } From bdc133397bcbfb8c329ead4289fd3f5e63cd1d11 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 22 Apr 2020 04:52:12 +0000 Subject: [PATCH 09/40] better names cleanup docs --- cpp/src/parquet/column_reader.cc | 28 +++++----- cpp/src/parquet/column_reader.h | 9 ++-- cpp/src/parquet/level_conversion.h | 84 ++++++++++++++++-------------- 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7318f0d13e3..9e5815456db 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -115,13 +115,13 @@ void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_lev int64_t* values_read, int64_t* null_count, uint8_t* valid_bits, int64_t valid_bits_offset) { constexpr int64_t kBitMaskSize = 64; - int64_t valid_count = 0; + int64_t set_count = 0; *values_read = 0; while (num_def_levels > 0) { int64_t batch_size = std::min(num_def_levels, kBitMaskSize); CheckLevelRange(def_levels, batch_size, required_definition_level); - uint64_t valid_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, - required_definition_level - 1); + uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); if (has_repeated_parent) { // This is currently a specialized code path assuming only (nested) lists // present through the leaf (i.e. no structs). @@ -132,22 +132,22 @@ void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_lev // definition - 1). If there where structs mixed in, we need to know the def_level // of the repeated parent so we can check for def_level > "def level of repeated // parent". - uint64_t present_bits = internal::GreaterThanBitmap(def_levels, batch_size, - required_definition_level - 2); - *values_read += internal::AppendValidityBitmap( - /*reversed_bitmap=*/valid_bitmap, - /*reversed_parent_bitmap=*/present_bits, - /*num_entries=*/batch_size, valid_bits, &valid_bits_offset, &valid_count); + uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + *values_read += internal::AppendSelectedBitsToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, + &set_count); } else { - *values_read += internal::AppendValidityBitmap( - /*reversed_bitmap=*/valid_bitmap, - /*reversed_parent_bitmap=*/-1, // this is unused - /*num_entries=*/batch_size, valid_bits, &valid_bits_offset, &valid_count); + internal::AppendToValidityBitmap( + /*new_bits=*/defined_bitmap, + /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); + *values_read += batch_size; } def_levels += batch_size; num_def_levels -= batch_size; } - *null_count += *values_read - valid_count; + *null_count += *values_read - set_count; } inline void DefinitionLevelsToBitmapDispatch( diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index a1957bbb0a2..86dc2d4a428 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -314,11 +314,10 @@ class DictionaryRecordReader : virtual public RecordReader { virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; }; -void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, - const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, - int64_t* null_count, uint8_t* valid_bits, - int64_t valid_bits_offset); +void PARQUET_EXPORT DefinitionLevelsToBitmap( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset); // TODO(itaiin): another code path split to merge when the general case is done static inline bool HasSpacedValues(const ColumnDescriptor* descr) { diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 4e522b26f50..31e0edb0bfe 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -57,17 +57,17 @@ static inline int64_t GreaterThanBitmap(const int16_t* levels, int64_t num_level return LevelsToBitmap(levels, num_levels, [&](int16_t value) { return value > rhs; }); } -/// Append bits [0, number_of_bits) from new_Bits to valid_bits and valid_bits_offset. +/// Append bits number_of_bits from new_bits to valid_bits and valid_bits_offset. /// -/// \param[in,out] valid_bits The valid bit bitmap to append to. -/// \param[in] valid_bits_offset The bit-offset at which to start appending new bits. -/// \param[in] valid_bits_length The number of bytes allocated in valid_bits. /// \param[in] new_bits The zero-padded bitmap to append. /// \param[in] number_of_bits The number of bits to append from new_bits. +/// \param[in] valid_bits_length The number of bytes allocated in valid_bits. +/// \param[in] valid_bits_offset The bit-offset at which to start appending new bits. +/// \param[in,out] valid_bits The validity bitmap to append to. /// \returns The new bit offset inside of valid_bits. -static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offset, - int64_t valid_bits_length, uint64_t new_bits, - int64_t number_of_bits) { +static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, + int64_t valid_bits_length, int64_t valid_bits_offset, + uint8_t* valid_bits) { // Selection masks to retrieve all low order bits for each bytes. constexpr uint64_t kLsbSelectionMasks[] = { 0, // unused. @@ -140,42 +140,48 @@ static inline int64_t AppendBitmap(uint8_t* valid_bits, int64_t valid_bits_offse /// \brief Appends bit values to the validitdy bimap_valid bits, based on bitmaps /// generated by GreaterThanBitmap, and the appropriate treshold definition_leve. /// -/// \param[in] bitmap Bitmap generated by GreaterThanBitmap to ensure it exceeds the def -/// level to be non-null. \param[in] repeated_parent_bitmap Bitmap generated by -/// GreaterThanBitmap to find values that exceeds the def level to be considered "present" -/// (i.e. def level greater than the last repeated parents def_level). \param[in] -/// num_entries The number of bits to process in bitmap \[0, 64\]. This is ignored if -/// has_repeated_parent is false. -/// \param[in,out] valid_bits The validity bitmap to update. -/// \param[in,out] valid_bits_offset The current offset to start appending bits to in -/// valid_bits (updated to -/// new value after appending bits). -/// \param[out] set_bit_count The number of set (valid) bits added to valid_bits. -/// \returns The nunmber of values add to the bitmap (set + unset). -template -int64_t AppendValidityBitmap(uint64_t bitmap, uint64_t repeated_parent_bitmap, - int64_t num_entries, uint8_t* valid_bits, - int64_t* valid_bits_offset, int64_t* set_bit_count) { - int64_t values_read = num_entries; +/// \param[in] new_bits Bitmap to append (intrepreted as Little-endian/LSB). +/// \param[in] new_bit_count The number of bits to append from new_bits. +/// \param[in,out] validity_bitmap The validity bitmap to update. +/// \param[in,out] validity_bitmap_offset The offset to start appending bits to in +/// valid_bits (updated to latest bitmap). +/// \param[in,out] set_bit_count The number of set bits added to set_bit_count. +void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, + uint8_t* validity_bitmap, int64_t* validity_bitmap_offset, + int64_t* set_bit_count) { int64_t min_valid_bits_size = - ::arrow::BitUtil::BytesForBits(num_entries + *valid_bits_offset); - if (has_repeated_parent) { + ::arrow::BitUtil::BytesForBits(new_bit_count + *validity_bitmap_offset); + + *set_bit_count += ::arrow::BitUtil::PopCount(new_bits); + *validity_bitmap_offset = AppendBitmap(new_bits, new_bit_count, min_valid_bits_size, + *validity_bitmap_offset, validity_bitmap); +} + +/// The same as AppendToValidityBitmap but only appends bits from bitmap that have +/// a corresponding bit set in selection_bitmap. +/// +/// \returns The number of bits appended. +/// +/// N.B. This is only implemented for archiectures that suppor the BMI2 instruction +/// set. +int64_t AppendSelectedBitsToValidityBitmap(uint64_t new_bits, uint64_t selection_bitmap, + uint8_t* validity_bitmap, + int64_t* validity_bitmap_offset, + int64_t* set_bit_count) { #if defined(ARROW_HAVE_BMI2) - // If the parent list was empty at for the given slot it should not be added to the - // bitmap. - bitmap = _pext_u64(bitmap, repeated_parent_bitmap); - values_read = - static_cast(::arrow::BitUtil::PopCount(repeated_parent_bitmap)); + // If the parent list was empty at for the given slot it should not be added to the + // bitmap. + uint64_t selected_bits = _pext_u64(new_bits, selection_bitmap); + int64_t selected_count = + static_cast(::arrow::BitUtil::PopCount(selection_bitmap)); + + AppendToValidityBitmap(selected_bits, selected_count, validity_bitmap, + validity_bitmap_offset, set_bit_count); + return selected_count; #else - // We shouldn't get here. - std::abort(); + // We shouldn't get here. + std::abort(); #endif - } - - *set_bit_count += ::arrow::BitUtil::PopCount(bitmap); - *valid_bits_offset = AppendBitmap(valid_bits, *valid_bits_offset, min_valid_bits_size, - bitmap, values_read); - return values_read; } } // namespace internal From 5b5df2a80e8ebbc8bd9409e78ac5b5ccee670bbf Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 22 Apr 2020 07:55:49 +0000 Subject: [PATCH 10/40] add some tests more needed --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/level_conversion.h | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index b2b78f3cab5..f4a95eff7b8 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -336,6 +336,7 @@ set_source_files_properties(public_api_test.cc add_parquet_test(reader_test SOURCES column_reader_test.cc + level_conversion_test.cc column_scanner_test.cc reader_test.cc stream_reader_test.cc diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 31e0edb0bfe..3be6ad0422c 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -95,12 +95,14 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, // over to the existing byte and shift up. const ByteAddressableBitmap carry_bits(kLsbSelectionMasks[bits_to_carry]); // Mask to select non-carried bits. - const uint64_t inverse_selection_mask = ~carry_bits.mask; + const ByteAddressableBitmap inverse_selection(~carry_bits.mask); // Fill out the last incomplete byte in the output, by extracting the least // siginficant bits from the first byte. const ByteAddressableBitmap new_bitmap(new_bits); + // valid bits should be a valid bitmask so all trailing bytes hsould be unset + // so no mask is need to start. valid_bits[valid_byte_offset] = - valid_bits[valid_byte_offset] | + valid_bits[valid_byte_offset] | // See above the (((new_bitmap.bytes[0] & carry_bits.bytes[0])) << bit_offset); // We illustrate logic with a 3-byte example in little endian/LSB order. @@ -110,7 +112,7 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, // Shifted mask should look like this assuming bit offset = 6: // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N // clang-format on - uint64_t shifted_new_bits = (new_bits & inverse_selection_mask) >> bits_to_carry; + uint64_t shifted_new_bits = (new_bits & inverse_selection.mask) >> bits_to_carry; // captured_carry: // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N uint64_t captured_carry = carry_bits.mask & new_bits; @@ -125,7 +127,7 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, } int64_t bytes_for_new_bits = ::arrow::BitUtil::BytesForBits(number_of_bits); - if (valid_bits_length - ::arrow::BitUtil::BytesForBits(valid_bits_offset) > + if (valid_bits_length - ::arrow::BitUtil::BytesForBits(valid_bits_offset) >= static_cast(sizeof(new_bits))) { // This should be the common case and inlined as a single instruction which // should be cheaper then the general case of calling mempcy, so it is likely From fa4e36409e5dc64e25a2402df3b4c88486db60f5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 22 Apr 2020 07:56:35 +0000 Subject: [PATCH 11/40] actually add test --- cpp/src/parquet/level_conversion_test.cc | 88 ++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 cpp/src/parquet/level_conversion_test.cc diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc new file mode 100644 index 00000000000..9df5a07719b --- /dev/null +++ b/cpp/src/parquet/level_conversion_test.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/level_conversion.h" + +#include +#include + +#include + +#include "arrow/util/bit_util.h" + +namespace parquet { +namespace internal { + +using ::testing::ElementsAreArray; + +std::string ToString(const std::vector& bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap.data(), /*offset*/ 0, /*length=*/bit_count) + .ToString(); +} + +TEST(TestAppendBitmap, TestOffsetOverwritesCorrectBitsOnExistingByte) { + auto check_append = [](const std::string& expected_bits, int64_t offset) { + std::vector valid_bits = {0x00}; + constexpr int64_t kBitsAfterAppend = 8; + ASSERT_EQ( + AppendBitmap(/*new_bits=*/0xFF, /*number_of_bits*/ 8 - offset, + /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()), + kBitsAfterAppend); + EXPECT_EQ(ToString(valid_bits, kBitsAfterAppend), expected_bits); + }; + check_append("11111111", 0); + check_append("01111111", 1); + check_append("00111111", 2); + check_append("00011111", 3); + check_append("00001111", 4); + check_append("00000111", 5); + check_append("00000011", 6); + check_append("00000001", 7); +} + +TEST(TestAppendBitmap, AllBytesAreWrittenWithEnoughSpace) { + std::vector valid_bits(/*count=*/9, 0); + + uint64_t bitmap = 0xFFFFFFFFFFFFFFFF; + AppendBitmap(bitmap, /*number_of_bits*/ 7, + /*valid_bits_length=*/valid_bits.size(), + /*valid_bits_offset=*/1, + /*valid_bits=*/valid_bits.data()); + EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{ + 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01})); +} + +TEST(TestAppendBitmap, OnlyApproriateBytesWrittenWhenLessThen8BytesAvailable) { + std::vector valid_bits = {0x00, 0x00}; + + uint64_t bitmap = 0x1FF; + AppendBitmap(bitmap, /*number_of_bits*/ 7, + /*valid_bits_length=*/2, + /*valid_bits_offset=*/1, + /*valid_bits=*/valid_bits.data()); + + EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x00})); + + AppendBitmap(bitmap, /*number_of_bits*/ 9, + /*valid_bits_length=*/2, + /*valid_bits_offset=*/1, + /*valid_bits=*/valid_bits.data()); + EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x03})); +} + +} // namespace internal +} // namespace parquet From a9c8b05396e15d65bdde897aa34bb14e7bd1fd6d Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 23 Apr 2020 06:07:44 +0000 Subject: [PATCH 12/40] address comments --- cpp/src/parquet/CMakeLists.txt | 2 +- cpp/src/parquet/column_reader.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index f4a95eff7b8..280b93f4096 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -336,7 +336,7 @@ set_source_files_properties(public_api_test.cc add_parquet_test(reader_test SOURCES column_reader_test.cc - level_conversion_test.cc + level_conversion_test.cc column_scanner_test.cc reader_test.cc stream_reader_test.cc diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 9e5815456db..7808f2144bf 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -156,10 +156,10 @@ inline void DefinitionLevelsToBitmapDispatch( uint8_t* valid_bits, int64_t valid_bits_offset) { if (max_repetition_level > 0) { #if ARROW_LITTLE_ENDIAN -// we use AVx2 as a proxy for BMI2 instruction set, since there doesn't seem to be a clean -// way o detect that latter for MSVC. + + #if defined(ARROW_HAVE_BMI2) - // We need BIM2 instruction which is AVX2 should imply. + // BMI2 is required for efficient bit extraction. DefinitionLevelsToBitmapSimd( def_levels, num_def_levels, max_definition_level, values_read, null_count, valid_bits, valid_bits_offset); From 9b3bbebaf0a88225b4acf8a2f8aba8bc46ef3987 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 23 Apr 2020 06:26:14 +0000 Subject: [PATCH 13/40] test GreaterThanBitmap --- cpp/src/parquet/column_reader.cc | 1 - cpp/src/parquet/level_conversion_test.cc | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 7808f2144bf..e104d4291b8 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -157,7 +157,6 @@ inline void DefinitionLevelsToBitmapDispatch( if (max_repetition_level > 0) { #if ARROW_LITTLE_ENDIAN - #if defined(ARROW_HAVE_BMI2) // BMI2 is required for efficient bit extraction. DefinitionLevelsToBitmapSimd( diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index 9df5a07719b..6a6e6bee4a4 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -34,6 +34,22 @@ std::string ToString(const std::vector& bitmap, int64_t bit_count) { .ToString(); } +TEST(TestGreaterThanBitmap, GeneratesExpetedBitmasks) { + std::vector levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/0, /*rhs*/ 0), 0); + EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 8), 0); + EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ -1), + 0xFFFFFFFFFFFFFFFF); + // Should be zero padded. + EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/47, /*rhs*/ -1), + 0x7FFFFFFFFFFF); + EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 6), + 0x8080808080808080); +} + TEST(TestAppendBitmap, TestOffsetOverwritesCorrectBitsOnExistingByte) { auto check_append = [](const std::string& expected_bits, int64_t offset) { std::vector valid_bits = {0x00}; From 91f68145c4a9296bf02baad9c71c55a4e3460236 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 23 Apr 2020 07:04:21 +0000 Subject: [PATCH 14/40] add shift tests --- cpp/src/parquet/level_conversion_test.cc | 38 ++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index 6a6e6bee4a4..1ff0120c460 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -29,12 +29,15 @@ namespace internal { using ::testing::ElementsAreArray; +std::string ToString(const uint8_t* bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); +} + std::string ToString(const std::vector& bitmap, int64_t bit_count) { - return arrow::internal::Bitmap(bitmap.data(), /*offset*/ 0, /*length=*/bit_count) - .ToString(); + return ToString(bitmap.data(), bit_count); } -TEST(TestGreaterThanBitmap, GeneratesExpetedBitmasks) { +TEST(TestGreaterThanBitmap, GeneratesExpectedBitmasks) { std::vector levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, @@ -70,6 +73,35 @@ TEST(TestAppendBitmap, TestOffsetOverwritesCorrectBitsOnExistingByte) { check_append("00000001", 7); } +TEST(TestAppendBitmap, TestOffsetShiftBitsCorrectly) { + constexpr uint64_t kPattern = 0x9A9A9A9A9A9A9A9A; + auto check_append = [&](const std::string& leading_bits, const std::string& middle_bits, + const std::string& trailing_bits, int64_t offset) { + ASSERT_GE(offset, 8); + std::vector valid_bits(/*count=*/10, 0); + valid_bits[0] = 0x99; + + AppendBitmap(/*new_bits=*/kPattern, /*number_of_bits*/ 64, + /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()); + EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get chanked. + EXPECT_EQ(ToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); + for (int x = 2; x < 9; x++) { + EXPECT_EQ(ToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits); + } + EXPECT_EQ(ToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); + }; + // Original Pattern = "01011001" + check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", + /*trailing_bits=*/"00000000", /*offset=*/8); + check_append("00101100", "10101100", "10000000", 9); + check_append("00010110", "01010110", "01000000", 10); + check_append("00001011", "00101011", "00100000", 11); + check_append("00000101", "10010101", "10010000", 12); + check_append("00000010", "11001010", "11001000", 13); + check_append("00000001", "01100101", "01100100", 14); + check_append("00000000", "10110010", "10110010", 15); +} + TEST(TestAppendBitmap, AllBytesAreWrittenWithEnoughSpace) { std::vector valid_bits(/*count=*/9, 0); From 0fef43b127da7570da24993d2d31a4e555c5844a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 23 Apr 2020 07:21:22 +0000 Subject: [PATCH 15/40] add remaining unit tests --- cpp/src/parquet/level_conversion.h | 3 ++- cpp/src/parquet/level_conversion_test.cc | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 3be6ad0422c..509fe53a391 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -147,7 +147,8 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, /// \param[in,out] validity_bitmap The validity bitmap to update. /// \param[in,out] validity_bitmap_offset The offset to start appending bits to in /// valid_bits (updated to latest bitmap). -/// \param[in,out] set_bit_count The number of set bits added to set_bit_count. +/// \param[in,out] set_bit_count The number of set bits appended is added to +/// set_bit_count. void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, uint8_t* validity_bitmap, int64_t* validity_bitmap_offset, int64_t* set_bit_count) { diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index 1ff0120c460..719639fb1ba 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -132,5 +132,28 @@ TEST(TestAppendBitmap, OnlyApproriateBytesWrittenWhenLessThen8BytesAvailable) { EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x03})); } +TEST(TestAppendToValidityBitmap, BasicOperation) { + std::vector validity_bitmap(/*count*/ 8, 0); + int64_t valid_bitmap_offset = 1; + int64_t set_bit_count = 5; + AppendToValidityBitmap(/*new_bits*/ 0x99, /*new_bit_count=*/31, validity_bitmap.data(), + &valid_bitmap_offset, &set_bit_count); + EXPECT_EQ(ToString(validity_bitmap, valid_bitmap_offset), + "01001100 10000000 00000000 00000000"); + EXPECT_EQ(set_bit_count, /*5 + 4 set bits=*/9); +} + +TEST(TestAppendSelectedBitsToValidityBitmapi, BasicOperation) { + std::vector validity_bitmap(/*count*/ 8, 0); + int64_t valid_bitmap_offset = 1; + int64_t set_bit_count = 5; + EXPECT_EQ(AppendSelectedBitsToValidityBitmap( + /*new_bits*/ 0x99, /*selection_bitmap=*/0xB8, validity_bitmap.data(), + &valid_bitmap_offset, &set_bit_count), + /*bits_processed=*/4); + EXPECT_EQ(ToString(validity_bitmap, valid_bitmap_offset), "01101"); + EXPECT_EQ(set_bit_count, /*5 + 3 set bits=*/8); +} + } // namespace internal } // namespace parquet From 44411f31a76603a99467016274be0d8776e39fc6 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 23 Apr 2020 07:41:44 +0000 Subject: [PATCH 16/40] skip selection test if no BMI2 --- cpp/src/parquet/level_conversion_test.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index 719639fb1ba..3d6979afcfb 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -143,7 +143,10 @@ TEST(TestAppendToValidityBitmap, BasicOperation) { EXPECT_EQ(set_bit_count, /*5 + 4 set bits=*/9); } -TEST(TestAppendSelectedBitsToValidityBitmapi, BasicOperation) { +TEST(TestAppendSelectedBitsToValidityBitmap, BasicOperation) { +#if !defined(ARROW_HAVE_BMI2) + return; +#endif std::vector validity_bitmap(/*count*/ 8, 0); int64_t valid_bitmap_offset = 1; int64_t set_bit_count = 5; From 04109d4627074c721d5c7182505ca942ae4a8410 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 24 Apr 2020 03:44:32 +0000 Subject: [PATCH 17/40] change back to -mavx2 and add -mbmi2 --- cpp/cmake_modules/SetupCxxFlags.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 753bb303874..7408426bc92 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -42,7 +42,7 @@ if(ARROW_CPU_FLAG STREQUAL "x86") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") - set(ARROW_AVX2_FLAG "-march=core-avx2") + set(ARROW_AVX2_FLAG "-mavx2 -mbmi2") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ set(ARROW_AVX512_FLAG "-march=skylake-avx512") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) From f08d8362fc257d444dd76fa4a99de16e777b2b85 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 24 Apr 2020 04:33:30 +0000 Subject: [PATCH 18/40] fix format --- cpp/cmake_modules/SetupCxxFlags.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 7408426bc92..3248320d617 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -48,7 +48,6 @@ if(ARROW_CPU_FLAG STREQUAL "x86") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) - check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) check_cxx_compiler_flag(${ARROW_AVX512_FLAG} CXX_SUPPORTS_AVX512) elseif(ARROW_CPU_FLAG STREQUAL "ppc") # power compiler flags, gcc/clang only @@ -314,7 +313,8 @@ if(ARROW_CPU_FLAG STREQUAL "x86") message(FATAL_ERROR "AVX512 required but compiler doesn't support it.") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX512_FLAG}") - add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_BMI2 -DARROW_HAVE_SSE4_2) + add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_BMI2 + -DARROW_HAVE_SSE4_2) elseif(ARROW_SIMD_LEVEL STREQUAL "AVX2") if(NOT CXX_SUPPORTS_AVX2) message(FATAL_ERROR "AVX2 required but compiler doesn't support it.") From 6a082252153d36e1fa35cfbea9d73c1a537fa2e2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 24 Apr 2020 05:39:31 +0000 Subject: [PATCH 19/40] address more comments --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/column_reader.cc | 152 +---------------------- cpp/src/parquet/column_reader.h | 5 - cpp/src/parquet/column_reader_test.cc | 51 -------- cpp/src/parquet/level_conversion.h | 75 +++++------ cpp/src/parquet/level_conversion_test.cc | 74 +++++++++-- 6 files changed, 106 insertions(+), 252 deletions(-) diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 280b93f4096..8b5dfffe2d0 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -187,6 +187,7 @@ set(PARQUET_SRCS file_writer.cc internal_file_decryptor.cc internal_file_encryptor.cc + level_conversion.cc metadata.cc murmur3.cc "${ARROW_SOURCE_DIR}/src/generated/parquet_constants.cpp" diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index e104d4291b8..e218f8451ac 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -52,140 +52,6 @@ using arrow::internal::checked_cast; namespace parquet { -namespace { - -inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, - const int16_t max_expected_level) { - int16_t min_level = std::numeric_limits::max(); - int16_t max_level = std::numeric_limits::min(); - for (int x = 0; x < num_levels; x++) { - min_level = std::min(levels[x], min_level); - max_level = std::max(levels[x], max_level); - } - if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { - throw ParquetException("definition level exceeds maximum"); - } -} - -#if !defined(ARROW_HAVE_BMI2) - -inline void DefinitionLevelsToBitmapScalar( - const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset) { - // We assume here that valid_bits is large enough to accommodate the - // additional definition levels and the ones that have already been written - ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, - num_def_levels); - - // TODO(itaiin): As an interim solution we are splitting the code path here - // between repeated+flat column reads, and non-repeated+nested reads. - // Those paths need to be merged in the future - for (int i = 0; i < num_def_levels; ++i) { - if (def_levels[i] == max_definition_level) { - valid_bits_writer.Set(); - } else if (max_repetition_level > 0) { - // repetition+flat case - if (def_levels[i] == (max_definition_level - 1)) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - continue; - } - } else { - // non-repeated+nested case - if (def_levels[i] < max_definition_level) { - valid_bits_writer.Clear(); - *null_count += 1; - } else { - throw ParquetException("definition level exceeds maximum"); - } - } - - valid_bits_writer.Next(); - } - valid_bits_writer.Finish(); - *values_read = valid_bits_writer.position(); -} -#endif - -template -void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, - const int16_t required_definition_level, - int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset) { - constexpr int64_t kBitMaskSize = 64; - int64_t set_count = 0; - *values_read = 0; - while (num_def_levels > 0) { - int64_t batch_size = std::min(num_def_levels, kBitMaskSize); - CheckLevelRange(def_levels, batch_size, required_definition_level); - uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, - required_definition_level - 1); - if (has_repeated_parent) { - // This is currently a specialized code path assuming only (nested) lists - // present through the leaf (i.e. no structs). - // Upper level code only calls this method - // when the leaf-values are nullable (otherwise no spacing is needed), - // Because only nested lists exists it is sufficient to know that the field - // was either null or included it (i.e. >= previous definition level -> > previous - // definition - 1). If there where structs mixed in, we need to know the def_level - // of the repeated parent so we can check for def_level > "def level of repeated - // parent". - uint64_t present_bitmap = internal::GreaterThanBitmap( - def_levels, batch_size, required_definition_level - 2); - *values_read += internal::AppendSelectedBitsToValidityBitmap( - /*new_bits=*/defined_bitmap, - /*selection_bitmap*/ present_bitmap, valid_bits, &valid_bits_offset, - &set_count); - } else { - internal::AppendToValidityBitmap( - /*new_bits=*/defined_bitmap, - /*new_bit_count=*/batch_size, valid_bits, &valid_bits_offset, &set_count); - *values_read += batch_size; - } - def_levels += batch_size; - num_def_levels -= batch_size; - } - *null_count += *values_read - set_count; -} - -inline void DefinitionLevelsToBitmapDispatch( - const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset) { - if (max_repetition_level > 0) { -#if ARROW_LITTLE_ENDIAN - -#if defined(ARROW_HAVE_BMI2) - // BMI2 is required for efficient bit extraction. - DefinitionLevelsToBitmapSimd( - def_levels, num_def_levels, max_definition_level, values_read, null_count, - valid_bits, valid_bits_offset); -#else - DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, - max_repetition_level, values_read, null_count, - valid_bits, valid_bits_offset); -#endif // ARROW_HAVE_BMI2 - - } else { - // No Special intsturction are used for non-repeated case. - DefinitionLevelsToBitmapSimd( - def_levels, num_def_levels, max_definition_level, values_read, null_count, - valid_bits, valid_bits_offset); - } -} - -#else // big-endian - // Optimized SIMD uses bit shifts that are unlikely to work on big endian platforms. - DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, - max_repitition_level, values_read, null_count, - valid_bits, valid_bits_offset); - -#endif - -} // namespace - LevelDecoder::LevelDecoder() : num_values_remaining_(0) {} LevelDecoder::~LevelDecoder() {} @@ -1022,9 +888,9 @@ int64_t TypedColumnReaderImpl::ReadBatchSpaced( /*bits_are_set=*/true); *values_read = total_values; } else { - DefinitionLevelsToBitmapDispatch(def_levels, num_def_levels, this->max_def_level_, - this->max_rep_level_, values_read, &null_count, - valid_bits, valid_bits_offset); + internal::DefinitionLevelsToBitmap(def_levels, num_def_levels, this->max_def_level_, + this->max_rep_level_, values_read, &null_count, + valid_bits, valid_bits_offset); total_values = this->ReadValuesSpaced(*values_read, values, static_cast(null_count), valid_bits, valid_bits_offset); @@ -1127,16 +993,6 @@ namespace internal { // better vectorized performance when doing many smaller record reads constexpr int64_t kMinLevelBatchSize = 1024; -void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, - const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, - int64_t* null_count, uint8_t* valid_bits, - int64_t valid_bits_offset) { - DefinitionLevelsToBitmapDispatch(def_levels, num_def_levels, max_definition_level, - max_repetition_level, values_read, null_count, - valid_bits, valid_bits_offset); -} - template class TypedRecordReader : public ColumnReaderImplBase, virtual public RecordReader { @@ -1468,7 +1324,7 @@ class TypedRecordReader : public ColumnReaderImplBase, int64_t null_count = 0; if (nullable_values_) { int64_t values_with_nulls = 0; - DefinitionLevelsToBitmapDispatch( + DefinitionLevelsToBitmap( def_levels() + start_levels_position, levels_position_ - start_levels_position, this->max_def_level_, this->max_rep_level_, &values_with_nulls, &null_count, valid_bits_->mutable_data(), values_written_); diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 86dc2d4a428..7b5ee1b722a 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -314,11 +314,6 @@ class DictionaryRecordReader : virtual public RecordReader { virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; }; -void PARQUET_EXPORT DefinitionLevelsToBitmap( - const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, int64_t valid_bits_offset); - // TODO(itaiin): another code path split to merge when the general case is done static inline bool HasSpacedValues(const ColumnDescriptor* descr) { if (descr->max_repetition_level() > 0) { diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc index b8d51cf4aa3..6dc7de88d4d 100644 --- a/cpp/src/parquet/column_reader_test.cc +++ b/cpp/src/parquet/column_reader_test.cc @@ -382,56 +382,5 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { pages_.clear(); } -TEST(TestColumnReader, DefinitionLevelsToBitmap) { - // Bugs in this function were exposed in ARROW-3930 - std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3}; - std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1}; - - std::vector valid_bits(2, 0); - - const int max_def_level = 3; - const int max_rep_level = 1; - - int64_t values_read = -1; - int64_t null_count = 0; - internal::DefinitionLevelsToBitmap(def_levels.data(), 9, max_def_level, max_rep_level, - &values_read, &null_count, valid_bits.data(), - 0 /* valid_bits_offset */); - ASSERT_EQ(9, values_read); - ASSERT_EQ(1, null_count); - - // Call again with 0 definition levels, make sure that valid_bits is unmodified - const uint8_t current_byte = valid_bits[1]; - null_count = 0; - internal::DefinitionLevelsToBitmap(def_levels.data(), 0, max_def_level, max_rep_level, - &values_read, &null_count, valid_bits.data(), - 9 /* valid_bits_offset */); - ASSERT_EQ(0, values_read); - ASSERT_EQ(0, null_count); - ASSERT_EQ(current_byte, valid_bits[1]); -} - -TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) { - // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a - // length equal to a power of two and also using a non-zero valid_bits_offset. This - // should not fail when run with ASAN or valgrind. - std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3}; - std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1}; - std::vector valid_bits(1, 0); - - const int max_def_level = 3; - const int max_rep_level = 1; - - int64_t values_read = -1; - int64_t null_count = 0; - - // Read the latter half of the validity bitmap - internal::DefinitionLevelsToBitmap(def_levels.data() + 4, 4, max_def_level, - max_rep_level, &values_read, &null_count, - valid_bits.data(), 4 /* valid_bits_offset */); - ASSERT_EQ(4, values_read); - ASSERT_EQ(0, null_count); -} - } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 509fe53a391..ed719bdb819 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -18,7 +18,9 @@ #pragma once #include + #include "arrow/util/bit_util.h" +#include "parquet/platform.h" #if defined(ARROW_HAVE_BMI2) #include "x86intrin.h" @@ -26,6 +28,12 @@ namespace parquet { namespace internal { + +void PARQUET_EXPORT DefinitionLevelsToBitmap( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset); + // These APIs are likely to be revised as part of ARROW-8494 to reduce duplicate code. // They currently represent minimal functionality for vectorized computation of definition // levels. @@ -40,9 +48,11 @@ namespace internal { /// /// N.B. Correct byte ordering is dependent on little-endian architectures. /// + +#if defined(ARROW_LITTLE_ENDIAN) template uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate predicate) { - // Both clang and GCC can vectorize this automatically with AVX2. + // Both clang and GCC can vectorize this automatically with SSE4/AVX2. uint64_t mask = 0; for (int x = 0; x < num_levels; x++) { mask |= static_cast(predicate(levels[x]) ? 1 : 0) << x; @@ -50,13 +60,15 @@ uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate pre return mask; } -/// Builds a bitmap where each set bit indicates the correspond level is greater +/// Builds a bitmap where each set bit indicates the corresponding level is greater /// than rhs. -static inline int64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, - int16_t rhs) { +static inline uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, + int16_t rhs) { return LevelsToBitmap(levels, num_levels, [&](int16_t value) { return value > rhs; }); } +#endif + /// Append bits number_of_bits from new_bits to valid_bits and valid_bits_offset. /// /// \param[in] new_bits The zero-padded bitmap to append. @@ -83,39 +95,34 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, int64_t bit_offset = valid_bits_offset % 8; int64_t new_offset = valid_bits_offset + number_of_bits; - union ByteAddressableBitmap { - explicit ByteAddressableBitmap(uint64_t mask) : mask(mask) {} - uint64_t mask; - uint8_t bytes[8]; - }; if (bit_offset != 0) { int64_t bits_to_carry = 8 - bit_offset; // Get the mask the will select the lower order bits (the ones to carry // over to the existing byte and shift up. - const ByteAddressableBitmap carry_bits(kLsbSelectionMasks[bits_to_carry]); + const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; // Mask to select non-carried bits. - const ByteAddressableBitmap inverse_selection(~carry_bits.mask); - // Fill out the last incomplete byte in the output, by extracting the least - // siginficant bits from the first byte. - const ByteAddressableBitmap new_bitmap(new_bits); + const uint64_t non_carry_mask = ~carry_mask; + // valid bits should be a valid bitmask so all trailing bytes hsould be unset // so no mask is need to start. - valid_bits[valid_byte_offset] = - valid_bits[valid_byte_offset] | // See above the - (((new_bitmap.bytes[0] & carry_bits.bytes[0])) << bit_offset); + valid_bits[valid_byte_offset] = valid_bits[valid_byte_offset] | // See above the + (((new_bits & carry_mask) & 0xFF) << bit_offset); - // We illustrate logic with a 3-byte example in little endian/LSB order. + // We illustrate logic with a 3-byte example in little endian/LSB order + // (N indicates note set). // Note this ordering is the reversed from HEX masks above with are expressed // big-endian/MSB and shifts right move the bits to the left (division). // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - // Shifted mask should look like this assuming bit offset = 6: + // Assuming a bit-off of 6: + // + // So shifted_new_bits should look like; // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N // clang-format on - uint64_t shifted_new_bits = (new_bits & inverse_selection.mask) >> bits_to_carry; + uint64_t shifted_new_bits = (new_bits & non_carry_mask) >> bits_to_carry; // captured_carry: // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N - uint64_t captured_carry = carry_bits.mask & new_bits; + uint64_t captured_carry = carry_mask & new_bits; // mask_cary_bits: // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; @@ -149,9 +156,10 @@ static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, /// valid_bits (updated to latest bitmap). /// \param[in,out] set_bit_count The number of set bits appended is added to /// set_bit_count. -void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, - uint8_t* validity_bitmap, int64_t* validity_bitmap_offset, - int64_t* set_bit_count) { +static inline void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, + uint8_t* validity_bitmap, + int64_t* validity_bitmap_offset, + int64_t* set_bit_count) { int64_t min_valid_bits_size = ::arrow::BitUtil::BytesForBits(new_bit_count + *validity_bitmap_offset); @@ -160,18 +168,16 @@ void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, *validity_bitmap_offset, validity_bitmap); } +#if defined(ARROW_HAVE_BMI2) /// The same as AppendToValidityBitmap but only appends bits from bitmap that have /// a corresponding bit set in selection_bitmap. /// /// \returns The number of bits appended. -/// -/// N.B. This is only implemented for archiectures that suppor the BMI2 instruction -/// set. -int64_t AppendSelectedBitsToValidityBitmap(uint64_t new_bits, uint64_t selection_bitmap, - uint8_t* validity_bitmap, - int64_t* validity_bitmap_offset, - int64_t* set_bit_count) { -#if defined(ARROW_HAVE_BMI2) +static inline int64_t AppendSelectedBitsToValidityBitmap(uint64_t new_bits, + uint64_t selection_bitmap, + uint8_t* validity_bitmap, + int64_t* validity_bitmap_offset, + int64_t* set_bit_count) { // If the parent list was empty at for the given slot it should not be added to the // bitmap. uint64_t selected_bits = _pext_u64(new_bits, selection_bitmap); @@ -181,11 +187,8 @@ int64_t AppendSelectedBitsToValidityBitmap(uint64_t new_bits, uint64_t selection AppendToValidityBitmap(selected_bits, selected_count, validity_bitmap, validity_bitmap_offset, set_bit_count); return selected_count; -#else - // We shouldn't get here. - std::abort(); -#endif } +#endif } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index 3d6979afcfb..a1816aff56c 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -29,12 +29,63 @@ namespace internal { using ::testing::ElementsAreArray; -std::string ToString(const uint8_t* bitmap, int64_t bit_count) { +std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); } -std::string ToString(const std::vector& bitmap, int64_t bit_count) { - return ToString(bitmap.data(), bit_count); +std::string BitmapToString(const std::vector& bitmap, int64_t bit_count) { + return BitmapToString(bitmap.data(), bit_count); +} + +TEST(TestColumnReader, DefinitionLevelsToBitmap) { + // Bugs in this function were exposed in ARROW-3930 + std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3}; + std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1}; + + std::vector valid_bits(2, 0); + + const int max_def_level = 3; + const int max_rep_level = 1; + + int64_t values_read = -1; + int64_t null_count = 0; + internal::DefinitionLevelsToBitmap(def_levels.data(), 9, max_def_level, max_rep_level, + &values_read, &null_count, valid_bits.data(), + 0 /* valid_bits_offset */); + ASSERT_EQ(9, values_read); + ASSERT_EQ(1, null_count); + + // Call again with 0 definition levels, make sure that valid_bits is unmodified + const uint8_t current_byte = valid_bits[1]; + null_count = 0; + internal::DefinitionLevelsToBitmap(def_levels.data(), 0, max_def_level, max_rep_level, + &values_read, &null_count, valid_bits.data(), + 9 /* valid_bits_offset */); + ASSERT_EQ(0, values_read); + ASSERT_EQ(0, null_count); + ASSERT_EQ(current_byte, valid_bits[1]); +} + +TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) { + // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a + // length equal to a power of two and also using a non-zero valid_bits_offset. This + // should not fail when run with ASAN or valgrind. + std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3}; + std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1}; + std::vector valid_bits(1, 0); + + const int max_def_level = 3; + const int max_rep_level = 1; + + int64_t values_read = -1; + int64_t null_count = 0; + + // Read the latter half of the validity bitmap + internal::DefinitionLevelsToBitmap(def_levels.data() + 4, 4, max_def_level, + max_rep_level, &values_read, &null_count, + valid_bits.data(), 4 /* valid_bits_offset */); + ASSERT_EQ(4, values_read); + ASSERT_EQ(0, null_count); } TEST(TestGreaterThanBitmap, GeneratesExpectedBitmasks) { @@ -61,7 +112,7 @@ TEST(TestAppendBitmap, TestOffsetOverwritesCorrectBitsOnExistingByte) { AppendBitmap(/*new_bits=*/0xFF, /*number_of_bits*/ 8 - offset, /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()), kBitsAfterAppend); - EXPECT_EQ(ToString(valid_bits, kBitsAfterAppend), expected_bits); + EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); }; check_append("11111111", 0); check_append("01111111", 1); @@ -84,11 +135,11 @@ TEST(TestAppendBitmap, TestOffsetShiftBitsCorrectly) { AppendBitmap(/*new_bits=*/kPattern, /*number_of_bits*/ 64, /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()); EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get chanked. - EXPECT_EQ(ToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); + EXPECT_EQ(BitmapToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); for (int x = 2; x < 9; x++) { - EXPECT_EQ(ToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits); + EXPECT_EQ(BitmapToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits); } - EXPECT_EQ(ToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); + EXPECT_EQ(BitmapToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); }; // Original Pattern = "01011001" check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", @@ -138,15 +189,13 @@ TEST(TestAppendToValidityBitmap, BasicOperation) { int64_t set_bit_count = 5; AppendToValidityBitmap(/*new_bits*/ 0x99, /*new_bit_count=*/31, validity_bitmap.data(), &valid_bitmap_offset, &set_bit_count); - EXPECT_EQ(ToString(validity_bitmap, valid_bitmap_offset), + EXPECT_EQ(BitmapToString(validity_bitmap, valid_bitmap_offset), "01001100 10000000 00000000 00000000"); EXPECT_EQ(set_bit_count, /*5 + 4 set bits=*/9); } +#if defined(ARROW_HAVE_BMI2) TEST(TestAppendSelectedBitsToValidityBitmap, BasicOperation) { -#if !defined(ARROW_HAVE_BMI2) - return; -#endif std::vector validity_bitmap(/*count*/ 8, 0); int64_t valid_bitmap_offset = 1; int64_t set_bit_count = 5; @@ -154,9 +203,10 @@ TEST(TestAppendSelectedBitsToValidityBitmap, BasicOperation) { /*new_bits*/ 0x99, /*selection_bitmap=*/0xB8, validity_bitmap.data(), &valid_bitmap_offset, &set_bit_count), /*bits_processed=*/4); - EXPECT_EQ(ToString(validity_bitmap, valid_bitmap_offset), "01101"); + EXPECT_EQ(BitmapToString(validity_bitmap, valid_bitmap_offset), "01101"); EXPECT_EQ(set_bit_count, /*5 + 3 set bits=*/8); } +#endif } // namespace internal } // namespace parquet From 189403d6664eb54fe2613b19809929208cc616cf Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 25 Apr 2020 05:14:25 +0000 Subject: [PATCH 20/40] Address remaining comments. - Move AppendBitmap functionality to FirstTimeBitmapWriter - Move level related code (and corresponding unit tests) to level_conversion specific files --- cpp/src/arrow/util/bit_util.h | 97 +++++++++++++++++- cpp/src/arrow/util/bit_util_test.cc | 112 +++++++++++++++++++++ cpp/src/parquet/level_conversion.h | 121 ----------------------- cpp/src/parquet/level_conversion_test.cc | 120 ++++------------------ 4 files changed, 225 insertions(+), 225 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 9ec4c4642eb..8074d26d034 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -586,6 +586,101 @@ class FirstTimeBitmapWriter { } } + /// Appends number_of_bits from word to valid_bits and valid_bits_offset. + /// + /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed + /// to be zero. + /// \param[in] number_of_bits The number of bits to append from word. + void AppendWord(uint64_t word, int64_t number_of_bits) { +#if defined(ARROW_LITTLE_ENDIAN) + // Selection masks to retrieve all low order bits for each bytes. + constexpr uint64_t kLsbSelectionMasks[] = { + 0, // unused. + 0x0101010101010101, + 0x0303030303030303, + 0x0707070707070707, + 0x0F0F0F0F0F0F0F0F, + 0x1F1F1F1F1F1F1F1F, + 0x3F3F3F3F3F3F3F3F, + 0x7F7F7F7F7F7F7F7F, + }; + if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { + return; + } + + // Location that the first byte needs to be written to. + uint8_t* append_position = bitmap_ + byte_offset_; + + // Update state variables except for current_byte_ here. + position_ += number_of_bits; + int64_t bit_offset = BitUtil::CountTrailingZeros(static_cast(bit_mask_)); + bit_mask_ = BitUtil::kBitmask[(bit_offset + number_of_bits) % 8]; + byte_offset_ += (bit_offset + number_of_bits) / 8; + + if (bit_offset != 0) { + // We are in the middle of the byte. This code updates the byte and shifts the word + // so trailing bits can be appended below. + int64_t bits_to_carry = 8 - bit_offset; + // Get the mask the will select the lower order bits (the ones to carry + // over to the existing byte and shift up. + const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; + // Mask to select non-carried bits. + const uint64_t non_carry_mask = ~carry_mask; + + // valid bits should be a valid bitmask so all trailing bytes hsould be unset + // so no mask is need to start. + current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { + return; + } + *append_position = current_byte_; + append_position++; + + // We illustrate logic with a 3-byte example in little endian/LSB order + // (N indicates not set bit Y indicates a set bit). + // Note this ordering is the reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left (division). + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 + // Assuming a bit-offset of 6 the non_carry_mask looks like: + // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y Y Y + // So shifted_word should look like; + // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N + // clang-format on + uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; + // captured_carry: + // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N + uint64_t captured_carry = carry_mask & word; + // mask_cary_bits: + // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N + uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; + + word = shifted_word | mask_carry_bits; + number_of_bits -= bits_to_carry; + } + + int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); + std::memcpy(append_position, &word, bytes_for_word); + // At this point, we are guaranteed to have flushed the previous current_byte_ state. + // So the new state is either the last relevant byte in 'word' + // or zero if we happen to be at a fresh byte. + current_byte_ = + bit_mask_ != 0x1 ? *(reinterpret_cast(&word) + bytes_for_word - 1) : 0; + +#else // big-endian + BitmapReader reader(reinterpret_cast(&word), 0, /*length=*/number_of_bits); + for (int x = 0; x < number_of_bits; x++) { + if (reader.IsSet()) { + Set(); + } else { + Clear(); + } + Next(); + reader.Next(); + } + return; +#endif + } + void Set() { current_byte_ |= bit_mask_; } void Clear() {} @@ -602,7 +697,7 @@ class FirstTimeBitmapWriter { } void Finish() { - // Store current byte if we didn't went past bitmap storage + // Store current byte if we didn't go past bitmap storage if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) { bitmap_[byte_offset_] = current_byte_; } diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 8b87e698bc3..83bf5ccd973 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -26,6 +26,7 @@ #include #include +#include #include #include "arrow/buffer.h" @@ -46,6 +47,8 @@ using internal::CopyBitmap; using internal::CountSetBits; using internal::InvertBitmap; +using ::testing::ElementsAreArray; + template void WriteVectorToWriter(BitmapWriter& writer, const std::vector values) { for (const auto& value : values) { @@ -315,6 +318,115 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { } } +std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { + return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); +} + +std::string BitmapToString(const std::vector& bitmap, int64_t bit_count) { + return BitmapToString(bitmap.data(), bit_count); +} + +TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) { + auto check_append = [](const std::string& expected_bits, int64_t offset) { + std::vector valid_bits = {0x00}; + constexpr int64_t kBitsAfterAppend = 8; + internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, + /*length=*/(8 * valid_bits.size()) - offset); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); + }; + check_append("11111111", 0); + check_append("01111111", 1); + check_append("00111111", 2); + check_append("00011111", 3); + check_append("00001111", 4); + check_append("00000111", 5); + check_append("00000011", 6); + check_append("00000001", 7); +} + +TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { + std::vector valid_bits(/*count=*/1, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/valid_bits.size() * 8); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); + writer.AppendWord(/*word=*/0x01, /*number_of_bits=*/1); + writer.Finish(); + EXPECT_EQ(valid_bits[0], 0x2); +} + +TEST(FirstTimeBitmapWriter, AppendLessThenByte) { + std::vector valid_bits(/*count*/ 8, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/8); + writer.AppendWord(0xB, 4); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "01101000"); +} + +TEST(FirstTimeBitmapWriter, AppendByteThenMore) { + std::vector valid_bits(/*count*/ 8, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, + /*length=*/9); + writer.AppendWord(0xC3, 8); + writer.AppendWord(0x01, 1); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); +} + +TEST(FirstTimeBitmapWriter, AppendWordShiftsBitsCorrectly) { + constexpr uint64_t kPattern = 0x9A9A9A9A9A9A9A9A; + auto check_append = [&](const std::string& leading_bits, const std::string& middle_bits, + const std::string& trailing_bits, int64_t offset) { + ASSERT_GE(offset, 8); + std::vector valid_bits(/*count=*/10, 0); + valid_bits[0] = 0x99; + internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, + /*length=*/(8 * valid_bits.size()) - offset); + writer.AppendWord(/*word=*/kPattern, /*number_of_bits=*/64); + writer.Finish(); + EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get chanked. + EXPECT_EQ(BitmapToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); + for (int x = 2; x < 9; x++) { + EXPECT_EQ(BitmapToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits) + << "x: " << x << " " << offset << " " << BitmapToString(valid_bits.data(), 80); + } + EXPECT_EQ(BitmapToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); + }; + // Original Pattern = "01011001" + check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", + /*trailing_bits=*/"00000000", /*offset=*/8); + check_append("00101100", "10101100", "10000000", 9); + check_append("00010110", "01010110", "01000000", 10); + check_append("00001011", "00101011", "00100000", 11); + check_append("00000101", "10010101", "10010000", 12); + check_append("00000010", "11001010", "11001000", 13); + check_append("00000001", "01100101", "01100100", 14); + check_append("00000000", "10110010", "10110010", 15); +} + +TEST(TestAppendBitmap, AppendWordOnlyApproriateBytesWritten) { + std::vector valid_bits = {0x00, 0x00}; + + uint64_t bitmap = 0x1FF; + { + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/(8 * valid_bits.size()) - 1); + writer.AppendWord(bitmap, /*number_of_bits*/ 7); + writer.Finish(); + EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x00})); + } + { + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/(8 * valid_bits.size()) - 1); + writer.AppendWord(bitmap, /*number_of_bits*/ 8); + writer.Finish(); + EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x03})); + } +} + // Tests for GenerateBits and GenerateBitsUnrolled struct GenerateBitsFunctor { diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index ed719bdb819..b863a2f92ad 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -69,126 +69,5 @@ static inline uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_leve #endif -/// Append bits number_of_bits from new_bits to valid_bits and valid_bits_offset. -/// -/// \param[in] new_bits The zero-padded bitmap to append. -/// \param[in] number_of_bits The number of bits to append from new_bits. -/// \param[in] valid_bits_length The number of bytes allocated in valid_bits. -/// \param[in] valid_bits_offset The bit-offset at which to start appending new bits. -/// \param[in,out] valid_bits The validity bitmap to append to. -/// \returns The new bit offset inside of valid_bits. -static inline int64_t AppendBitmap(uint64_t new_bits, int64_t number_of_bits, - int64_t valid_bits_length, int64_t valid_bits_offset, - uint8_t* valid_bits) { - // Selection masks to retrieve all low order bits for each bytes. - constexpr uint64_t kLsbSelectionMasks[] = { - 0, // unused. - 0x0101010101010101, - 0x0303030303030303, - 0x0707070707070707, - 0x0F0F0F0F0F0F0F0F, - 0x1F1F1F1F1F1F1F1F, - 0x3F3F3F3F3F3F3F3F, - 0x7F7F7F7F7F7F7F7F, - }; - int64_t valid_byte_offset = valid_bits_offset / 8; - int64_t bit_offset = valid_bits_offset % 8; - - int64_t new_offset = valid_bits_offset + number_of_bits; - - if (bit_offset != 0) { - int64_t bits_to_carry = 8 - bit_offset; - // Get the mask the will select the lower order bits (the ones to carry - // over to the existing byte and shift up. - const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; - // Mask to select non-carried bits. - const uint64_t non_carry_mask = ~carry_mask; - - // valid bits should be a valid bitmask so all trailing bytes hsould be unset - // so no mask is need to start. - valid_bits[valid_byte_offset] = valid_bits[valid_byte_offset] | // See above the - (((new_bits & carry_mask) & 0xFF) << bit_offset); - - // We illustrate logic with a 3-byte example in little endian/LSB order - // (N indicates note set). - // Note this ordering is the reversed from HEX masks above with are expressed - // big-endian/MSB and shifts right move the bits to the left (division). - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - // Assuming a bit-off of 6: - // - // So shifted_new_bits should look like; - // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N - // clang-format on - uint64_t shifted_new_bits = (new_bits & non_carry_mask) >> bits_to_carry; - // captured_carry: - // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N - uint64_t captured_carry = carry_mask & new_bits; - // mask_cary_bits: - // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N - uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; - - new_bits = shifted_new_bits | mask_carry_bits; - // Don't overwrite the first byte - valid_byte_offset += 1; - number_of_bits -= bits_to_carry; - } - - int64_t bytes_for_new_bits = ::arrow::BitUtil::BytesForBits(number_of_bits); - if (valid_bits_length - ::arrow::BitUtil::BytesForBits(valid_bits_offset) >= - static_cast(sizeof(new_bits))) { - // This should be the common case and inlined as a single instruction which - // should be cheaper then the general case of calling mempcy, so it is likely - // worth the extra branch. - std::memcpy(valid_bits + valid_byte_offset, &new_bits, sizeof(new_bits)); - } else { - std::memcpy(valid_bits + valid_byte_offset, &new_bits, bytes_for_new_bits); - } - return new_offset; -} - -/// \brief Appends bit values to the validitdy bimap_valid bits, based on bitmaps -/// generated by GreaterThanBitmap, and the appropriate treshold definition_leve. -/// -/// \param[in] new_bits Bitmap to append (intrepreted as Little-endian/LSB). -/// \param[in] new_bit_count The number of bits to append from new_bits. -/// \param[in,out] validity_bitmap The validity bitmap to update. -/// \param[in,out] validity_bitmap_offset The offset to start appending bits to in -/// valid_bits (updated to latest bitmap). -/// \param[in,out] set_bit_count The number of set bits appended is added to -/// set_bit_count. -static inline void AppendToValidityBitmap(uint64_t new_bits, int64_t new_bit_count, - uint8_t* validity_bitmap, - int64_t* validity_bitmap_offset, - int64_t* set_bit_count) { - int64_t min_valid_bits_size = - ::arrow::BitUtil::BytesForBits(new_bit_count + *validity_bitmap_offset); - - *set_bit_count += ::arrow::BitUtil::PopCount(new_bits); - *validity_bitmap_offset = AppendBitmap(new_bits, new_bit_count, min_valid_bits_size, - *validity_bitmap_offset, validity_bitmap); -} - -#if defined(ARROW_HAVE_BMI2) -/// The same as AppendToValidityBitmap but only appends bits from bitmap that have -/// a corresponding bit set in selection_bitmap. -/// -/// \returns The number of bits appended. -static inline int64_t AppendSelectedBitsToValidityBitmap(uint64_t new_bits, - uint64_t selection_bitmap, - uint8_t* validity_bitmap, - int64_t* validity_bitmap_offset, - int64_t* set_bit_count) { - // If the parent list was empty at for the given slot it should not be added to the - // bitmap. - uint64_t selected_bits = _pext_u64(new_bits, selection_bitmap); - int64_t selected_count = - static_cast(::arrow::BitUtil::PopCount(selection_bitmap)); - - AppendToValidityBitmap(selected_bits, selected_count, validity_bitmap, - validity_bitmap_offset, set_bit_count); - return selected_count; -} -#endif - } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index a1816aff56c..c53b7defbd0 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -40,7 +40,6 @@ std::string BitmapToString(const std::vector& bitmap, int64_t bit_count TEST(TestColumnReader, DefinitionLevelsToBitmap) { // Bugs in this function were exposed in ARROW-3930 std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3}; - std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1}; std::vector valid_bits(2, 0); @@ -71,7 +70,6 @@ TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) { // length equal to a power of two and also using a non-zero valid_bits_offset. This // should not fail when run with ASAN or valgrind. std::vector def_levels = {3, 3, 3, 2, 3, 3, 3, 3}; - std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1}; std::vector valid_bits(1, 0); const int max_def_level = 3; @@ -104,109 +102,25 @@ TEST(TestGreaterThanBitmap, GeneratesExpectedBitmasks) { 0x8080808080808080); } -TEST(TestAppendBitmap, TestOffsetOverwritesCorrectBitsOnExistingByte) { - auto check_append = [](const std::string& expected_bits, int64_t offset) { - std::vector valid_bits = {0x00}; - constexpr int64_t kBitsAfterAppend = 8; - ASSERT_EQ( - AppendBitmap(/*new_bits=*/0xFF, /*number_of_bits*/ 8 - offset, - /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()), - kBitsAfterAppend); - EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); - }; - check_append("11111111", 0); - check_append("01111111", 1); - check_append("00111111", 2); - check_append("00011111", 3); - check_append("00001111", 4); - check_append("00000111", 5); - check_append("00000011", 6); - check_append("00000001", 7); -} - -TEST(TestAppendBitmap, TestOffsetShiftBitsCorrectly) { - constexpr uint64_t kPattern = 0x9A9A9A9A9A9A9A9A; - auto check_append = [&](const std::string& leading_bits, const std::string& middle_bits, - const std::string& trailing_bits, int64_t offset) { - ASSERT_GE(offset, 8); - std::vector valid_bits(/*count=*/10, 0); - valid_bits[0] = 0x99; - - AppendBitmap(/*new_bits=*/kPattern, /*number_of_bits*/ 64, - /*valid_bits_length=*/valid_bits.size(), offset, valid_bits.data()); - EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get chanked. - EXPECT_EQ(BitmapToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); - for (int x = 2; x < 9; x++) { - EXPECT_EQ(BitmapToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits); - } - EXPECT_EQ(BitmapToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); - }; - // Original Pattern = "01011001" - check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", - /*trailing_bits=*/"00000000", /*offset=*/8); - check_append("00101100", "10101100", "10000000", 9); - check_append("00010110", "01010110", "01000000", 10); - check_append("00001011", "00101011", "00100000", 11); - check_append("00000101", "10010101", "10010000", 12); - check_append("00000010", "11001010", "11001000", 13); - check_append("00000001", "01100101", "01100100", 14); - check_append("00000000", "10110010", "10110010", 15); -} - -TEST(TestAppendBitmap, AllBytesAreWrittenWithEnoughSpace) { - std::vector valid_bits(/*count=*/9, 0); - - uint64_t bitmap = 0xFFFFFFFFFFFFFFFF; - AppendBitmap(bitmap, /*number_of_bits*/ 7, - /*valid_bits_length=*/valid_bits.size(), - /*valid_bits_offset=*/1, - /*valid_bits=*/valid_bits.data()); - EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{ - 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01})); -} - -TEST(TestAppendBitmap, OnlyApproriateBytesWrittenWhenLessThen8BytesAvailable) { - std::vector valid_bits = {0x00, 0x00}; - - uint64_t bitmap = 0x1FF; - AppendBitmap(bitmap, /*number_of_bits*/ 7, - /*valid_bits_length=*/2, - /*valid_bits_offset=*/1, - /*valid_bits=*/valid_bits.data()); - - EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x00})); - - AppendBitmap(bitmap, /*number_of_bits*/ 9, - /*valid_bits_length=*/2, - /*valid_bits_offset=*/1, - /*valid_bits=*/valid_bits.data()); - EXPECT_THAT(valid_bits, ElementsAreArray(std::vector{0xFE, 0x03})); -} - -TEST(TestAppendToValidityBitmap, BasicOperation) { - std::vector validity_bitmap(/*count*/ 8, 0); - int64_t valid_bitmap_offset = 1; - int64_t set_bit_count = 5; - AppendToValidityBitmap(/*new_bits*/ 0x99, /*new_bit_count=*/31, validity_bitmap.data(), - &valid_bitmap_offset, &set_bit_count); - EXPECT_EQ(BitmapToString(validity_bitmap, valid_bitmap_offset), - "01001100 10000000 00000000 00000000"); - EXPECT_EQ(set_bit_count, /*5 + 4 set bits=*/9); -} - -#if defined(ARROW_HAVE_BMI2) -TEST(TestAppendSelectedBitsToValidityBitmap, BasicOperation) { +TEST(DefinitionLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) { std::vector validity_bitmap(/*count*/ 8, 0); - int64_t valid_bitmap_offset = 1; - int64_t set_bit_count = 5; - EXPECT_EQ(AppendSelectedBitsToValidityBitmap( - /*new_bits*/ 0x99, /*selection_bitmap=*/0xB8, validity_bitmap.data(), - &valid_bitmap_offset, &set_bit_count), - /*bits_processed=*/4); - EXPECT_EQ(BitmapToString(validity_bitmap, valid_bitmap_offset), "01101"); - EXPECT_EQ(set_bit_count, /*5 + 3 set bits=*/8); + int64_t null_count = 5; + int64_t values_read = 1; + + // All zeros should be ignored, ones should be unset in the bitmp and 2 should be set. + std::vector def_levels = {0, 0, 0, 2, 2, 1, 0, 2}; + DefinitionLevelsToBitmap( + def_levels.data(), def_levels.size(), /*max_definition_level=*/2, + /*max_repetition_level=*/1, &values_read, &null_count, validity_bitmap.data(), + /*valid_bits_offset=*/1); + + EXPECT_EQ(BitmapToString(validity_bitmap, /*bit_count=*/8), "01101000"); + for (size_t x = 1; x < validity_bitmap.size(); x++) { + EXPECT_EQ(validity_bitmap[x], 0) << "index: " << x; + } + EXPECT_EQ(null_count, /*5 + 1 =*/6); + EXPECT_EQ(values_read, 4); // value should get overwritten. } -#endif } // namespace internal } // namespace parquet From 9aeb97d834ac7fca20e9eff4370b7df8e036a87f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 26 Apr 2020 05:15:50 +0000 Subject: [PATCH 21/40] add missing file --- cpp/src/parquet/level_conversion.cc | 162 ++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 cpp/src/parquet/level_conversion.cc diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc new file mode 100644 index 00000000000..537bd6d9ad5 --- /dev/null +++ b/cpp/src/parquet/level_conversion.cc @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "parquet/level_conversion.h" +#include "parquet/exception.h" + +namespace parquet { +namespace internal { +namespace { +inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, + const int16_t max_expected_level) { + int16_t min_level = std::numeric_limits::max(); + int16_t max_level = std::numeric_limits::min(); + for (int x = 0; x < num_levels; x++) { + min_level = std::min(levels[x], min_level); + max_level = std::max(levels[x], max_level); + } + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { + throw ParquetException("definition level exceeds maximum"); + } +} + +#if !defined(ARROW_HAVE_BMI2) + +inline void DefinitionLevelsToBitmapScalar( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written + ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, + num_def_levels); + + // TODO(itaiin): As an interim solution we are splitting the code path here + // between repeated+flat column reads, and non-repeated+nested reads. + // Those paths need to be merged in the future + for (int i = 0; i < num_def_levels; ++i) { + if (def_levels[i] == max_definition_level) { + valid_bits_writer.Set(); + } else if (max_repetition_level > 0) { + // repetition+flat case + if (def_levels[i] == (max_definition_level - 1)) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + continue; + } + } else { + // non-repeated+nested case + if (def_levels[i] < max_definition_level) { + valid_bits_writer.Clear(); + *null_count += 1; + } else { + throw ParquetException("definition level exceeds maximum"); + } + } + + valid_bits_writer.Next(); + } + valid_bits_writer.Finish(); + *values_read = valid_bits_writer.position(); +} +#endif + +template +void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { + constexpr int64_t kBitMaskSize = 64; + ::arrow::internal::FirstTimeBitmapWriter writer(valid_bits, + /*start_offset=*/valid_bits_offset, + /*length=*/num_def_levels); + int64_t set_count = 0; + *values_read = 0; + while (num_def_levels > 0) { + int64_t batch_size = std::min(num_def_levels, kBitMaskSize); + CheckLevelRange(def_levels, batch_size, required_definition_level); + uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 1); + if (has_repeated_parent) { +#if defined(ARROW_HAVE_BMI2) + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. definition level > max_definitation_level + // -2) If there where structs mixed in, we need to know the def_level of the + // repeated parent so we can check for def_level > "def level of repeated parent". + uint64_t present_bitmap = internal::GreaterThanBitmap( + def_levels, batch_size, required_definition_level - 2); + uint64_t selected_bits = _pext_u64(defined_bitmap, present_bitmap); + writer.AppendWord(selected_bits, ::arrow::BitUtil::PopCount(present_bitmap)); + set_count += ::arrow::BitUtil::PopCount(selected_bits); +#else + assert(false && "must not execute this without BMI2"); +#endif + } else { + writer.AppendWord(defined_bitmap, batch_size); + set_count += ::arrow::BitUtil::PopCount(defined_bitmap); + } + def_levels += batch_size; + num_def_levels -= batch_size; + } + + *values_read = writer.position(); + *null_count += *values_read - set_count; + writer.Finish(); +} + +} // namespace + +void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, + const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, + int64_t* null_count, uint8_t* valid_bits, + int64_t valid_bits_offset) { + if (max_repetition_level > 0) { +#if ARROW_LITTLE_ENDIAN + +#if defined(ARROW_HAVE_BMI2) + // BMI2 is required for efficient bit extraction. + DefinitionLevelsToBitmapSimd( + def_levels, num_def_levels, max_definition_level, values_read, null_count, + valid_bits, valid_bits_offset); +#else + DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); +#endif // ARROW_HAVE_BMI2 + + } else { + // No BMI2 intsturctions are used for non-repeated case. + DefinitionLevelsToBitmapSimd( + def_levels, num_def_levels, max_definition_level, values_read, null_count, + valid_bits, valid_bits_offset); + } + +#else // big-endian + DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); +#endif +} + +} // namespace internal +} // namespace parquet From 435523f8d34627b4401913bbf1d7a16c84649b0b Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 26 Apr 2020 05:17:47 +0000 Subject: [PATCH 22/40] move docs inside defined guard --- cpp/src/parquet/level_conversion.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index b863a2f92ad..82a57b5e992 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -38,6 +38,7 @@ void PARQUET_EXPORT DefinitionLevelsToBitmap( // They currently represent minimal functionality for vectorized computation of definition // levels. +#if defined(ARROW_LITTLE_ENDIAN) /// Builds a bitmap by applying predicate to the level vector provided. /// /// \param[in] levels Rep or def level array. @@ -48,8 +49,6 @@ void PARQUET_EXPORT DefinitionLevelsToBitmap( /// /// N.B. Correct byte ordering is dependent on little-endian architectures. /// - -#if defined(ARROW_LITTLE_ENDIAN) template uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate predicate) { // Both clang and GCC can vectorize this automatically with SSE4/AVX2. From 4226e16297d81117a201893811273baacd7a3738 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 26 Apr 2020 05:44:36 +0000 Subject: [PATCH 23/40] fix lint and some other unnecessary includes --- cpp/src/parquet/level_conversion.cc | 10 +++++++++- cpp/src/parquet/level_conversion.h | 5 ----- cpp/src/parquet/level_conversion_test.cc | 6 ++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index 537bd6d9ad5..d24809cff33 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -14,8 +14,16 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - #include "parquet/level_conversion.h" + +#include +#include +#if defined(ARROW_HAVE_BMI2) +#include +#endif + +#include "arrow/util/bit_util.h" + #include "parquet/exception.h" namespace parquet { diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index 82a57b5e992..afe5fabbb7d 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -19,13 +19,8 @@ #include -#include "arrow/util/bit_util.h" #include "parquet/platform.h" -#if defined(ARROW_HAVE_BMI2) -#include "x86intrin.h" -#endif - namespace parquet { namespace internal { diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index c53b7defbd0..0bdbad5a624 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -14,13 +14,13 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - #include "parquet/level_conversion.h" #include #include #include +#include #include "arrow/util/bit_util.h" @@ -86,7 +86,8 @@ TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) { ASSERT_EQ(0, null_count); } -TEST(TestGreaterThanBitmap, GeneratesExpectedBitmasks) { +#if defined(ARROW_LITTLE_ENDIAN) +TEST(GreaterThanBitmap, GeneratesExpectedBitmasks) { std::vector levels = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, @@ -101,6 +102,7 @@ TEST(TestGreaterThanBitmap, GeneratesExpectedBitmasks) { EXPECT_EQ(GreaterThanBitmap(levels.data(), /*num_levels=*/64, /*rhs*/ 6), 0x8080808080808080); } +#endif TEST(DefinitionLevelsToBitmap, WithRepetitionLevelFiltersOutEmptyListValues) { std::vector validity_bitmap(/*count*/ 8, 0); From a36288324634d0fc22da8b2e39d9b0baefb49945 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 1 May 2020 05:16:11 +0000 Subject: [PATCH 24/40] reword/try to fix typos in docs --- cpp/src/arrow/util/bit_util.h | 42 ++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 8074d26d034..1b3b79a2093 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -589,11 +589,11 @@ class FirstTimeBitmapWriter { /// Appends number_of_bits from word to valid_bits and valid_bits_offset. /// /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed - /// to be zero. + /// to be unset (i.e. 0). /// \param[in] number_of_bits The number of bits to append from word. void AppendWord(uint64_t word, int64_t number_of_bits) { #if defined(ARROW_LITTLE_ENDIAN) - // Selection masks to retrieve all low order bits for each bytes. + // Selection masks to retrieve all low order bits for each byte in word. constexpr uint64_t kLsbSelectionMasks[] = { 0, // unused. 0x0101010101010101, @@ -618,39 +618,41 @@ class FirstTimeBitmapWriter { byte_offset_ += (bit_offset + number_of_bits) / 8; if (bit_offset != 0) { - // We are in the middle of the byte. This code updates the byte and shifts the word - // so trailing bits can be appended below. + // We are in the middle of the byte. This code updates the byte and shifts + // bits appropriately within word so it can be memcpy'd below. int64_t bits_to_carry = 8 - bit_offset; - // Get the mask the will select the lower order bits (the ones to carry - // over to the existing byte and shift up. + // Get the mask that will select the least significant bit (the ones to carry + // over to the current_byte_ and shift up). const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; // Mask to select non-carried bits. const uint64_t non_carry_mask = ~carry_mask; - // valid bits should be a valid bitmask so all trailing bytes hsould be unset - // so no mask is need to start. + // Carry over bits from word to current_byte_. We assume any extra bits in word + // unset so no additional accounting is needed for when number_of_bits < + // bits_to_carry. current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + // Check if everything is transfered into current_byte_. if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { return; } *append_position = current_byte_; append_position++; - // We illustrate logic with a 3-byte example in little endian/LSB order - // (N indicates not set bit Y indicates a set bit). - // Note this ordering is the reversed from HEX masks above with are expressed - // big-endian/MSB and shifts right move the bits to the left (division). + // We illustrate the logic with a 3-byte example in little-endian/LSB order + // ('N' indicates a not set bit, 'Y' indicates a set bit). + // Note this ordering is reversed from HEX masks above with are expressed + // big-endian/MSB and shifts right move the bits to the left. + // The original bit positions are: // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - // Assuming a bit-offset of 6 the non_carry_mask looks like: + // Assuming a bit-offset of 6 non_carry_mask is: // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y Y Y - // So shifted_word should look like; + // shifted_word is: // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N - // clang-format on uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; - // captured_carry: + // captured_carry is: // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N uint64_t captured_carry = carry_mask & word; - // mask_cary_bits: + // mask_carry_bits is: // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; @@ -660,9 +662,9 @@ class FirstTimeBitmapWriter { int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); std::memcpy(append_position, &word, bytes_for_word); - // At this point, we are guaranteed to have flushed the previous current_byte_ state. - // So the new state is either the last relevant byte in 'word' - // or zero if we happen to be at a fresh byte. + // At this point, the previous current_byte_ has been written to bitmap_. + // The new current_byte_ is either the last relevant byte in 'word' + // or cleared if the new position is byte aligned (i.e. a fresh byte). current_byte_ = bit_mask_ != 0x1 ? *(reinterpret_cast(&word) + bytes_for_word - 1) : 0; From 439cb01cd5e600fa94133d649fb51a0762ebc37c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 4 May 2020 04:26:04 +0000 Subject: [PATCH 25/40] try suggested R fix --- r/R/parquet.R | 3 ++- r/man/ParquetFileReader.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index 8d363e4cb40..1415a06dab0 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -439,7 +439,7 @@ ParquetFileWriter$create <- function( #' @export #' @examples #' \donttest{ -#' f <- system.file("v0.7.1.parquet", package="arrow") +#' f <- ReadableFile$create(system.file("v0.7.1.parquet", package="arrow")) #' pq <- ParquetFileReader$create(f) #' pq$GetSchema() #' if (codec_is_available("snappy")) { @@ -447,6 +447,7 @@ ParquetFileWriter$create <- function( #' tab <- pq$ReadTable(starts_with("c")) #' tab$schema #' } +#' f$close() #' } #' @include arrow-package.R ParquetFileReader <- R6Class("ParquetFileReader", diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd index 1e9f78f4d04..3e052ae20d1 100644 --- a/r/man/ParquetFileReader.Rd +++ b/r/man/ParquetFileReader.Rd @@ -33,7 +33,7 @@ with columns filtered by a character vector of column names or a \examples{ \donttest{ -f <- system.file("v0.7.1.parquet", package="arrow") +f <- ReadableFile$create(system.file("v0.7.1.parquet", package="arrow")) pq <- ParquetFileReader$create(f) pq$GetSchema() if (codec_is_available("snappy")) { @@ -41,5 +41,6 @@ if (codec_is_available("snappy")) { tab <- pq$ReadTable(starts_with("c")) tab$schema } +f$close() } } From e1d0b78f9edb2bcca933b6e6ea0dbc837d29d764 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 May 2020 03:01:46 +0000 Subject: [PATCH 26/40] Revert "try suggested R fix" This reverts commit 59e39794974ebade0cff0ba702b6cbc9cbd43b87. --- r/R/parquet.R | 3 +-- r/man/ParquetFileReader.Rd | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/r/R/parquet.R b/r/R/parquet.R index 1415a06dab0..8d363e4cb40 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -439,7 +439,7 @@ ParquetFileWriter$create <- function( #' @export #' @examples #' \donttest{ -#' f <- ReadableFile$create(system.file("v0.7.1.parquet", package="arrow")) +#' f <- system.file("v0.7.1.parquet", package="arrow") #' pq <- ParquetFileReader$create(f) #' pq$GetSchema() #' if (codec_is_available("snappy")) { @@ -447,7 +447,6 @@ ParquetFileWriter$create <- function( #' tab <- pq$ReadTable(starts_with("c")) #' tab$schema #' } -#' f$close() #' } #' @include arrow-package.R ParquetFileReader <- R6Class("ParquetFileReader", diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd index 3e052ae20d1..1e9f78f4d04 100644 --- a/r/man/ParquetFileReader.Rd +++ b/r/man/ParquetFileReader.Rd @@ -33,7 +33,7 @@ with columns filtered by a character vector of column names or a \examples{ \donttest{ -f <- ReadableFile$create(system.file("v0.7.1.parquet", package="arrow")) +f <- system.file("v0.7.1.parquet", package="arrow") pq <- ParquetFileReader$create(f) pq$GetSchema() if (codec_is_available("snappy")) { @@ -41,6 +41,5 @@ if (codec_is_available("snappy")) { tab <- pq$ReadTable(starts_with("c")) tab$schema } -f$close() } } From 66e85958d8dd8c12921a0f2236853240b0ce9233 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 May 2020 03:32:19 +0000 Subject: [PATCH 27/40] simplify AppendWord --- cpp/src/arrow/util/bit_util.h | 54 +++++++---------------------------- 1 file changed, 11 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 1b3b79a2093..1dc171f61b4 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -593,17 +593,6 @@ class FirstTimeBitmapWriter { /// \param[in] number_of_bits The number of bits to append from word. void AppendWord(uint64_t word, int64_t number_of_bits) { #if defined(ARROW_LITTLE_ENDIAN) - // Selection masks to retrieve all low order bits for each byte in word. - constexpr uint64_t kLsbSelectionMasks[] = { - 0, // unused. - 0x0101010101010101, - 0x0303030303030303, - 0x0707070707070707, - 0x0F0F0F0F0F0F0F0F, - 0x1F1F1F1F1F1F1F1F, - 0x3F3F3F3F3F3F3F3F, - 0x7F7F7F7F7F7F7F7F, - }; if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { return; } @@ -621,42 +610,18 @@ class FirstTimeBitmapWriter { // We are in the middle of the byte. This code updates the byte and shifts // bits appropriately within word so it can be memcpy'd below. int64_t bits_to_carry = 8 - bit_offset; - // Get the mask that will select the least significant bit (the ones to carry - // over to the current_byte_ and shift up). - const uint64_t carry_mask = kLsbSelectionMasks[bits_to_carry]; - // Mask to select non-carried bits. - const uint64_t non_carry_mask = ~carry_mask; - // Carry over bits from word to current_byte_. We assume any extra bits in word // unset so no additional accounting is needed for when number_of_bits < // bits_to_carry. - current_byte_ |= (((word & carry_mask) & 0xFF) << bit_offset); + current_byte_ |= (word & BitUtil::kPrecedingBitmask[bits_to_carry]) << bit_offset; // Check if everything is transfered into current_byte_. if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) { return; } *append_position = current_byte_; append_position++; - - // We illustrate the logic with a 3-byte example in little-endian/LSB order - // ('N' indicates a not set bit, 'Y' indicates a set bit). - // Note this ordering is reversed from HEX masks above with are expressed - // big-endian/MSB and shifts right move the bits to the left. - // The original bit positions are: - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 - // Assuming a bit-offset of 6 non_carry_mask is: - // N N Y Y Y Y Y Y N N Y Y Y Y Y Y N N Y Y Y Y Y Y - // shifted_word is: - // 2 3 4 5 6 7 N N 10 11 12 13 14 15 N N 18 19 20 21 22 23 N N - uint64_t shifted_word = (word & non_carry_mask) >> bits_to_carry; - // captured_carry is: - // 0 1 N N N N N N 8 9 N N N N N N 16 17 N N N N N N - uint64_t captured_carry = carry_mask & word; - // mask_carry_bits is: - // N N N N N N 8 9 N N N N N N 16 17 N N N N N N N N - uint64_t mask_carry_bits = (captured_carry >> 8) << bit_offset; - - word = shifted_word | mask_carry_bits; + // Move the carry bits off of word. + word = word >> bits_to_carry; number_of_bits -= bits_to_carry; } @@ -665,19 +630,22 @@ class FirstTimeBitmapWriter { // At this point, the previous current_byte_ has been written to bitmap_. // The new current_byte_ is either the last relevant byte in 'word' // or cleared if the new position is byte aligned (i.e. a fresh byte). - current_byte_ = - bit_mask_ != 0x1 ? *(reinterpret_cast(&word) + bytes_for_word - 1) : 0; + if (bit_mask_ == 0x1) { + current_byte_ = 0; + } else { + // current_byte_ = *(reinterpret_cast(&word) + bytes_for_word - 1) : 0; + current_byte_ = *(append_position + bytes_for_word - 1); + } #else // big-endian - BitmapReader reader(reinterpret_cast(&word), 0, /*length=*/number_of_bits); for (int x = 0; x < number_of_bits; x++) { - if (reader.IsSet()) { + if (word & 0x1) { Set(); } else { Clear(); } Next(); - reader.Next(); + word >>= 1; } return; #endif From c0baaa9c108ed6e2b191eb6b4607a85824d874d7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 5 May 2020 03:53:32 +0000 Subject: [PATCH 28/40] add tests for offsets correct typo --- cpp/src/arrow/util/bit_util.h | 1 - cpp/src/arrow/util/bit_util_test.cc | 38 ++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 1dc171f61b4..f9eccb43c7b 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -633,7 +633,6 @@ class FirstTimeBitmapWriter { if (bit_mask_ == 0x1) { current_byte_ = 0; } else { - // current_byte_ = *(reinterpret_cast(&word) + bytes_for_word - 1) : 0; current_byte_ = *(append_position + bytes_for_word - 1); } diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 83bf5ccd973..d0572e8d117 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -344,6 +344,42 @@ TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) check_append("00000111", 5); check_append("00000011", 6); check_append("00000001", 7); + + auto check_with_set = [](const std::string& expected_bits, int64_t offset) { + std::vector valid_bits = {0x1}; + constexpr int64_t kBitsAfterAppend = 8; + internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, + /*length=*/(8 * valid_bits.size()) - offset); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); + }; + // 0ffset zero would not be a valid mask. + check_with_set("11111111", 1); + check_with_set("10111111", 2); + check_with_set("10011111", 3); + check_with_set("10001111", 4); + check_with_set("10000111", 5); + check_with_set("10000011", 6); + check_with_set("10000001", 7); + + auto check_with_preceding = [](const std::string& expected_bits, int64_t offset) { + std::vector valid_bits = {BitUtil::kPrecedingBitmask[offset]}; + constexpr int64_t kBitsAfterAppend = 8; + internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, + /*length=*/(8 * valid_bits.size()) - offset); + writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); + }; + check_with_preceding("11111111", 0); + check_with_preceding("11111111", 1); + check_with_preceding("11111111", 2); + check_with_preceding("11111111", 3); + check_with_preceding("11111111", 4); + check_with_preceding("11111111", 5); + check_with_preceding("11111111", 6); + check_with_preceding("11111111", 7); } TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { @@ -357,7 +393,7 @@ TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { EXPECT_EQ(valid_bits[0], 0x2); } -TEST(FirstTimeBitmapWriter, AppendLessThenByte) { +TEST(FirstTimeBitmapWriter, AppendLessThanByte) { std::vector valid_bits(/*count*/ 8, 0); internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, /*length=*/8); From d07849da3138555574e73e799c3e8e2a784f92d6 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 May 2020 03:37:44 +0000 Subject: [PATCH 29/40] add tests --- cpp/src/arrow/util/bit_util_test.cc | 69 +++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index d0572e8d117..af938231f7b 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -364,7 +364,7 @@ TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) check_with_set("10000001", 7); auto check_with_preceding = [](const std::string& expected_bits, int64_t offset) { - std::vector valid_bits = {BitUtil::kPrecedingBitmask[offset]}; + std::vector valid_bits = {0xFF}; constexpr int64_t kBitsAfterAppend = 8; internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, /*length=*/(8 * valid_bits.size()) - offset); @@ -394,36 +394,59 @@ TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { } TEST(FirstTimeBitmapWriter, AppendLessThanByte) { - std::vector valid_bits(/*count*/ 8, 0); - internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, - /*length=*/8); - writer.AppendWord(0xB, 4); - writer.Finish(); - EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "01101000"); + { + std::vector valid_bits(/*count*/ 8, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/8); + writer.AppendWord(0xB, 4); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "01101000"); + } + { + // Test with all bits initially set. + std::vector valid_bits(/*count*/ 8, 0xFF); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, + /*length=*/8); + writer.AppendWord(0xB, 4); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "11101000"); + } } TEST(FirstTimeBitmapWriter, AppendByteThenMore) { - std::vector valid_bits(/*count*/ 8, 0); - internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, - /*length=*/9); - writer.AppendWord(0xC3, 8); - writer.AppendWord(0x01, 1); - writer.Finish(); - EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); + { + std::vector valid_bits(/*count*/ 8, 0); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, + /*length=*/9); + writer.AppendWord(0xC3, 8); + writer.AppendWord(0x01, 1); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); + } + { + std::vector valid_bits(/*count*/ 8, 0xFF); + internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, + /*length=*/9); + writer.AppendWord(0xC3, 8); + writer.AppendWord(0x01, 1); + writer.Finish(); + EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); + } } TEST(FirstTimeBitmapWriter, AppendWordShiftsBitsCorrectly) { constexpr uint64_t kPattern = 0x9A9A9A9A9A9A9A9A; auto check_append = [&](const std::string& leading_bits, const std::string& middle_bits, - const std::string& trailing_bits, int64_t offset) { + const std::string& trailing_bits, int64_t offset, + bool preset_buffer_bits = false) { ASSERT_GE(offset, 8); - std::vector valid_bits(/*count=*/10, 0); + std::vector valid_bits(/*count=*/10, preset_buffer_bits ? 0xFF : 0); valid_bits[0] = 0x99; internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, - /*length=*/(8 * valid_bits.size()) - offset); + /*length=*/(9 * sizeof(kPattern)) - offset); writer.AppendWord(/*word=*/kPattern, /*number_of_bits=*/64); writer.Finish(); - EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get chanked. + EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get changed. EXPECT_EQ(BitmapToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); for (int x = 2; x < 9; x++) { EXPECT_EQ(BitmapToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits) @@ -441,6 +464,16 @@ TEST(FirstTimeBitmapWriter, AppendWordShiftsBitsCorrectly) { check_append("00000010", "11001010", "11001000", 13); check_append("00000001", "01100101", "01100100", 14); check_append("00000000", "10110010", "10110010", 15); + + check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", + /*trailing_bits=*/"11111111", /*offset=*/8, /*preset_buffer_bits=*/true); + check_append("10101100", "10101100", "10000000", 9, true); + check_append("11010110", "01010110", "01000000", 10, true); + check_append("11101011", "00101011", "00100000", 11, true); + check_append("11110101", "10010101", "10010000", 12, true); + check_append("11111010", "11001010", "11001000", 13, true); + check_append("11111101", "01100101", "01100100", 14, true); + check_append("11111110", "10110010", "10110010", 15, true); } TEST(TestAppendBitmap, AppendWordOnlyApproriateBytesWritten) { From c82ac10897c6e4f09f7651e8fb47c4f0900e0277 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 May 2020 04:21:28 +0000 Subject: [PATCH 30/40] address the rest of feedbac --- cpp/src/parquet/level_conversion.cc | 108 +++++++++++++++++----------- cpp/src/parquet/level_conversion.h | 4 +- 2 files changed, 68 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index d24809cff33..fcc3295b74f 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -23,6 +23,7 @@ #endif #include "arrow/util/bit_util.h" +#include "arrow/util/logging.h" #include "parquet/exception.h" @@ -84,6 +85,40 @@ inline void DefinitionLevelsToBitmapScalar( } #endif +template +void DefinitionLevelsBatchToBitmap(const int16_t* def_levels, int64_t num_def_levels, + const int16_t required_definition_level, + const int batch_size, int64_t* set_count, + ::arrow::internal::FirstTimeBitmapWriter* writer) { + CheckLevelRange(def_levels, batch_size, required_definition_level); + uint64_t defined_bitmap = + internal::GreaterThanBitmap(def_levels, batch_size, required_definition_level - 1); + + DCHECK_LE(batch_size, 64); + if (has_repeated_parent) { +#if defined(ARROW_HAVE_BMI2) + // This is currently a specialized code path assuming only (nested) lists + // present through the leaf (i.e. no structs). + // Upper level code only calls this method + // when the leaf-values are nullable (otherwise no spacing is needed), + // Because only nested lists exists it is sufficient to know that the field + // was either null or included it (i.e. definition level > max_definitation_level + // -2) If there where structs mixed in, we need to know the def_level of the + // repeated parent so we can check for def_level > "def level of repeated parent". + uint64_t present_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, + required_definition_level - 2); + uint64_t selected_bits = _pext_u64(defined_bitmap, present_bitmap); + writer->AppendWord(selected_bits, ::arrow::BitUtil::PopCount(present_bitmap)); + *set_count += ::arrow::BitUtil::PopCount(selected_bits); +#else + assert(false && "must not execute this without BMI2"); +#endif + } else { + writer->AppendWord(defined_bitmap, batch_size); + *set_count += ::arrow::BitUtil::PopCount(defined_bitmap); + } +} + template void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels, const int16_t required_definition_level, @@ -95,52 +130,27 @@ void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_lev /*length=*/num_def_levels); int64_t set_count = 0; *values_read = 0; - while (num_def_levels > 0) { - int64_t batch_size = std::min(num_def_levels, kBitMaskSize); - CheckLevelRange(def_levels, batch_size, required_definition_level); - uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, - required_definition_level - 1); - if (has_repeated_parent) { -#if defined(ARROW_HAVE_BMI2) - // This is currently a specialized code path assuming only (nested) lists - // present through the leaf (i.e. no structs). - // Upper level code only calls this method - // when the leaf-values are nullable (otherwise no spacing is needed), - // Because only nested lists exists it is sufficient to know that the field - // was either null or included it (i.e. definition level > max_definitation_level - // -2) If there where structs mixed in, we need to know the def_level of the - // repeated parent so we can check for def_level > "def level of repeated parent". - uint64_t present_bitmap = internal::GreaterThanBitmap( - def_levels, batch_size, required_definition_level - 2); - uint64_t selected_bits = _pext_u64(defined_bitmap, present_bitmap); - writer.AppendWord(selected_bits, ::arrow::BitUtil::PopCount(present_bitmap)); - set_count += ::arrow::BitUtil::PopCount(selected_bits); -#else - assert(false && "must not execute this without BMI2"); -#endif - } else { - writer.AppendWord(defined_bitmap, batch_size); - set_count += ::arrow::BitUtil::PopCount(defined_bitmap); - } - def_levels += batch_size; - num_def_levels -= batch_size; + while (num_def_levels > kBitMaskSize) { + DefinitionLevelsBatchToBitmap( + def_levels, num_def_levels, required_definition_level, + /*batch_size=*/kBitMaskSize, &set_count, &writer); + def_levels += kBitMaskSize; + num_def_levels -= kBitMaskSize; } + DefinitionLevelsBatchToBitmap( + def_levels, num_def_levels, required_definition_level, + /*batch_size=*/num_def_levels, &set_count, &writer); *values_read = writer.position(); *null_count += *values_read - set_count; writer.Finish(); } -} // namespace - -void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, - const int16_t max_definition_level, - const int16_t max_repetition_level, int64_t* values_read, - int64_t* null_count, uint8_t* valid_bits, - int64_t valid_bits_offset) { +void DefinitionLevelsToBitmapLittleEndian( + const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, + uint8_t* valid_bits, int64_t valid_bits_offset) { if (max_repetition_level > 0) { -#if ARROW_LITTLE_ENDIAN - #if defined(ARROW_HAVE_BMI2) // BMI2 is required for efficient bit extraction. DefinitionLevelsToBitmapSimd( @@ -158,11 +168,25 @@ void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, def_levels, num_def_levels, max_definition_level, values_read, null_count, valid_bits, valid_bits_offset); } +} + +} // namespace + +void DefinitionLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, + const int16_t max_definition_level, + const int16_t max_repetition_level, int64_t* values_read, + int64_t* null_count, uint8_t* valid_bits, + int64_t valid_bits_offset) { +#if ARROW_LITTLE_ENDIAN + DefinitionLevelsToBitmapLittleEndian(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); + +#else + DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, + max_repetition_level, values_read, null_count, + valid_bits, valid_bits_offset); -#else // big-endian - DefinitionLevelsToBitmapScalar(def_levels, num_def_levels, max_definition_level, - max_repetition_level, values_read, null_count, - valid_bits, valid_bits_offset); #endif } diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index afe5fabbb7d..ff91ca4606e 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -49,7 +49,7 @@ uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate pre // Both clang and GCC can vectorize this automatically with SSE4/AVX2. uint64_t mask = 0; for (int x = 0; x < num_levels; x++) { - mask |= static_cast(predicate(levels[x]) ? 1 : 0) << x; + mask |= static_cast(predicate(levels[x]) ? 1 : 0) << x; } return mask; } @@ -58,7 +58,7 @@ uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels, Predicate pre /// than rhs. static inline uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) { - return LevelsToBitmap(levels, num_levels, [&](int16_t value) { return value > rhs; }); + return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; }); } #endif From 1593f9efba47eba76611a161586e17e8b19459e9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 May 2020 04:26:22 +0000 Subject: [PATCH 31/40] static assert failure on big endian --- cpp/src/arrow/util/bit_util.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index f9eccb43c7b..1bab31915ca 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -637,16 +637,7 @@ class FirstTimeBitmapWriter { } #else // big-endian - for (int x = 0; x < number_of_bits; x++) { - if (word & 0x1) { - Set(); - } else { - Clear(); - } - Next(); - word >>= 1; - } - return; + static_assert(false, "AppendWord not implement on Big Endian"); #endif } From d541d3a3ec70e24377f858b90fee95ea48556c17 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 6 May 2020 04:39:11 +0000 Subject: [PATCH 32/40] fix comparison --- cpp/src/parquet/level_conversion.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index fcc3295b74f..305f65d9ce0 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -38,7 +38,7 @@ inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, min_level = std::min(levels[x], min_level); max_level = std::max(levels[x], max_level); } - if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_level))) { + if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_expected_level))) { throw ParquetException("definition level exceeds maximum"); } } From 0f57459390589e20ba693f6d5009a33381cc7b28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Saint-Jacques?= Date: Wed, 6 May 2020 10:05:26 -0400 Subject: [PATCH 33/40] Refactor DefinitionLevelsBatchToBitmap signature and reformat --- cpp/src/parquet/level_conversion.cc | 30 +++++++++++++---------------- dev/archery/archery/cli.py | 5 +++++ dev/archery/archery/lang/cpp.py | 3 +++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index 305f65d9ce0..de2b5782774 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -24,7 +24,6 @@ #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" - #include "parquet/exception.h" namespace parquet { @@ -38,7 +37,8 @@ inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, min_level = std::min(levels[x], min_level); max_level = std::max(levels[x], max_level); } - if (ARROW_PREDICT_FALSE(num_levels > 0 && (min_level < 0 || max_level > max_expected_level))) { + if (ARROW_PREDICT_FALSE(num_levels > 0 && + (min_level < 0 || max_level > max_expected_level))) { throw ParquetException("definition level exceeds maximum"); } } @@ -86,10 +86,9 @@ inline void DefinitionLevelsToBitmapScalar( #endif template -void DefinitionLevelsBatchToBitmap(const int16_t* def_levels, int64_t num_def_levels, - const int16_t required_definition_level, - const int batch_size, int64_t* set_count, - ::arrow::internal::FirstTimeBitmapWriter* writer) { +int64_t DefinitionLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size, + const int16_t required_definition_level, + ::arrow::internal::FirstTimeBitmapWriter* writer) { CheckLevelRange(def_levels, batch_size, required_definition_level); uint64_t defined_bitmap = internal::GreaterThanBitmap(def_levels, batch_size, required_definition_level - 1); @@ -98,9 +97,8 @@ void DefinitionLevelsBatchToBitmap(const int16_t* def_levels, int64_t num_def_le if (has_repeated_parent) { #if defined(ARROW_HAVE_BMI2) // This is currently a specialized code path assuming only (nested) lists - // present through the leaf (i.e. no structs). - // Upper level code only calls this method - // when the leaf-values are nullable (otherwise no spacing is needed), + // present through the leaf (i.e. no structs). Upper level code only calls + // this method when the leaf-values are nullable (otherwise no spacing is needed), // Because only nested lists exists it is sufficient to know that the field // was either null or included it (i.e. definition level > max_definitation_level // -2) If there where structs mixed in, we need to know the def_level of the @@ -109,13 +107,13 @@ void DefinitionLevelsBatchToBitmap(const int16_t* def_levels, int64_t num_def_le required_definition_level - 2); uint64_t selected_bits = _pext_u64(defined_bitmap, present_bitmap); writer->AppendWord(selected_bits, ::arrow::BitUtil::PopCount(present_bitmap)); - *set_count += ::arrow::BitUtil::PopCount(selected_bits); + return ::arrow::BitUtil::PopCount(selected_bits); #else assert(false && "must not execute this without BMI2"); #endif } else { writer->AppendWord(defined_bitmap, batch_size); - *set_count += ::arrow::BitUtil::PopCount(defined_bitmap); + return ::arrow::BitUtil::PopCount(defined_bitmap); } } @@ -131,15 +129,13 @@ void DefinitionLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_lev int64_t set_count = 0; *values_read = 0; while (num_def_levels > kBitMaskSize) { - DefinitionLevelsBatchToBitmap( - def_levels, num_def_levels, required_definition_level, - /*batch_size=*/kBitMaskSize, &set_count, &writer); + set_count += DefinitionLevelsBatchToBitmap( + def_levels, kBitMaskSize, required_definition_level, &writer); def_levels += kBitMaskSize; num_def_levels -= kBitMaskSize; } - DefinitionLevelsBatchToBitmap( - def_levels, num_def_levels, required_definition_level, - /*batch_size=*/num_def_levels, &set_count, &writer); + set_count += DefinitionLevelsBatchToBitmap( + def_levels, num_def_levels, required_definition_level, &writer); *values_read = writer.position(); *null_count += *values_read - set_count; diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 15c05a40397..a9fddca81c2 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -102,6 +102,9 @@ def validate_arrow_sources(ctx, param, src): warn_level_type = click.Choice(["everything", "checkin", "production"], case_sensitive=False) +simd_level = click.Choice(["NONE", "SSE4_2", "AVX2", "AVX512"], + case_sensitive=True) + def cpp_toolchain_options(cmd): options = [ @@ -133,6 +136,8 @@ def _apply_options(cmd, options): help="Controls compiler warnings -W(no-)error.") @click.option("--use-gold-linker", default=True, type=BOOL, help="Toggles ARROW_USE_LD_GOLD option.") +@click.option("--simd-level", default="SSE4_2", type=simd_level, + help="Toggles ARROW_SIMD_LEVEL option.") # Tests and benchmarks @click.option("--with-tests", default=True, type=BOOL, help="Build with tests.") diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index 2769781096e..4af0f5434cc 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -62,6 +62,7 @@ def __init__(self, # extras with_lint_only=False, use_gold_linker=True, + simd_level="SSE4_2", cmake_extras=None): self._cc = cc self._cxx = cxx @@ -111,6 +112,7 @@ def __init__(self, self.with_lint_only = with_lint_only self.use_gold_linker = use_gold_linker + self.simd_level = simd_level self.cmake_extras = cmake_extras @@ -232,6 +234,7 @@ def _gen_defs(self): broken_with_gold_ld = [self.with_fuzzing, self.with_gandiva] if self.use_gold_linker and not any(broken_with_gold_ld): yield ("ARROW_USE_LD_GOLD", truthifier(self.use_gold_linker)) + yield ("ARROW_SIMD_LEVEL", or_else(self.simd_level, "SSE4_2")) # Detect custom conda toolchain if self.use_conda: From 858ef0133f3bbbfb71179325d5cc1dfe49a287d1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 May 2020 03:17:15 +0000 Subject: [PATCH 34/40] add repeated benchmark --- cpp/src/parquet/CMakeLists.txt | 1 + cpp/src/parquet/level_conversion_benchmark.cc | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 cpp/src/parquet/level_conversion_benchmark.cc diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 8b5dfffe2d0..915ff75804a 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -373,6 +373,7 @@ add_parquet_test(schema_test) add_parquet_benchmark(column_io_benchmark) add_parquet_benchmark(encoding_benchmark) +add_parquet_benchmark(level_conversion_benchmark) add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow") if(ARROW_WITH_BROTLI) diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc new file mode 100644 index 00000000000..30ddab9af72 --- /dev/null +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -0,0 +1,59 @@ +#include "benchmark/benchmark.h" + +#include "parquet/column_reader.h" + +constexpr int64_t kLevelCount = 2048; +// Def level indicating the element is missing from the leaf +// array (a parent repeated element was empty). +constexpr int16_t kMissingDefLevel = 0; + +// Definition Level indicating the values has an entry in the leaf element. +constexpr int16_t kPresentDefLevel = 2; + +// A repition level that indicates a repeated element. +constexpr int16_t kHasRepeatedElements = 1; + +std::vector RunDefinitionLevelsToBitmap(const std::vector& def_levels, + ::benchmark::State* state) { + int64_t values_read = 0; + int64_t null_count = 0; + std::vector bitmap(/*count=*/def_levels.size(), 0); + int rep = 0; + for (auto _ : *state) { + parquet::internal::DefinitionLevelsToBitmap( + def_levels.data(), def_levels.size(), /*max_definition_level=*/kPresentDefLevel, + /*max_repetition_level=*/kHasRepeatedElements, &values_read, &null_count, + bitmap.data(), /*valid_bits_offset=*/(rep++ % 8) * def_levels.size()); + } + state->SetBytesProcessed(int64_t(state->iterations()) * def_levels.size()); + return bitmap; +} + +void BM_DefinitionLevelsToBitmapRepeatedAllMissing(::benchmark::State& state) { + std::vector def_levels(/*count=*/kLevelCount, kMissingDefLevel); + auto result = RunDefinitionLevelsToBitmap(def_levels, &state); + ::benchmark::DoNotOptimize(result); +} + +BENCHMARK(BM_DefinitionLevelsToBitmapRepeatedAllMissing); + +void BM_DefinitionLevelsToBitmapRepeatedAllPresent(::benchmark::State& state) { + std::vector def_levels(/*count=*/kLevelCount, kPresentDefLevel); + auto result = RunDefinitionLevelsToBitmap(def_levels, &state); + ::benchmark::DoNotOptimize(result); +} + +BENCHMARK(BM_DefinitionLevelsToBitmapRepeatedAllPresent); + +void BM_DefinitionLevelsToBitmapRepeatedMostPresent(::benchmark::State& state) { + std::vector def_levels(/*count=*/kLevelCount, kPresentDefLevel); + for (int x = 0; x < def_levels.size(); x++) { + if (x % 10 == 0) { + def_levels[x] = kMissingDefLevel; + } + } + auto result = RunDefinitionLevelsToBitmap(def_levels, &state); + ::benchmark::DoNotOptimize(result); +} + +BENCHMARK(BM_DefinitionLevelsToBitmapRepeatedMostPresent); From 0113e02edfb891ff78d59d655d75b239bda59443 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 May 2020 03:23:05 +0000 Subject: [PATCH 35/40] update benchmark --- cpp/src/parquet/level_conversion_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc index 30ddab9af72..181a86d6b30 100644 --- a/cpp/src/parquet/level_conversion_benchmark.cc +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -1,6 +1,6 @@ #include "benchmark/benchmark.h" -#include "parquet/column_reader.h" +#include "parquet/level_conversion.h" constexpr int64_t kLevelCount = 2048; // Def level indicating the element is missing from the leaf From 90473905b61f0a0643e4a7df85f301b89a868b48 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sun, 17 May 2020 03:30:51 +0000 Subject: [PATCH 36/40] add header and some includes --- cpp/src/parquet/level_conversion_benchmark.cc | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc index 181a86d6b30..661969e0da9 100644 --- a/cpp/src/parquet/level_conversion_benchmark.cc +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -1,5 +1,24 @@ -#include "benchmark/benchmark.h" +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include "benchmark/benchmark.h" #include "parquet/level_conversion.h" constexpr int64_t kLevelCount = 2048; From 2b8be0c1bc143cd93e1020465d515101aee73b98 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 22 May 2020 04:05:14 +0000 Subject: [PATCH 37/40] only use bmi2 on avx512 to prevent use on AMD --- cpp/src/parquet/level_conversion.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index de2b5782774..cfa5df1a7e0 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -43,7 +43,7 @@ inline void CheckLevelRange(const int16_t* levels, int64_t num_levels, } } -#if !defined(ARROW_HAVE_BMI2) +#if !defined(ARROW_HAVE_AVX512) inline void DefinitionLevelsToBitmapScalar( const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, @@ -147,7 +147,11 @@ void DefinitionLevelsToBitmapLittleEndian( const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, uint8_t* valid_bits, int64_t valid_bits_offset) { if (max_repetition_level > 0) { -#if defined(ARROW_HAVE_BMI2) +// This is a short term hack to prevent using the pext BMI2 instructions +// on non-intel platforms where performance is subpar. +// In the medium term we will hopefully be able to runtime dispatch +// to use this on intel only platforms that support pext. +#if defined(ARROW_HAVE_AVX512) // BMI2 is required for efficient bit extraction. DefinitionLevelsToBitmapSimd( def_levels, num_def_levels, max_definition_level, values_read, null_count, From 74a6aa95b52d1aa0ab5dd437cf8c722e8f689627 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 2 Jun 2020 06:07:51 +0000 Subject: [PATCH 38/40] fix size argument --- cpp/src/parquet/level_conversion_benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/level_conversion_benchmark.cc b/cpp/src/parquet/level_conversion_benchmark.cc index 661969e0da9..4f15838d339 100644 --- a/cpp/src/parquet/level_conversion_benchmark.cc +++ b/cpp/src/parquet/level_conversion_benchmark.cc @@ -66,7 +66,7 @@ BENCHMARK(BM_DefinitionLevelsToBitmapRepeatedAllPresent); void BM_DefinitionLevelsToBitmapRepeatedMostPresent(::benchmark::State& state) { std::vector def_levels(/*count=*/kLevelCount, kPresentDefLevel); - for (int x = 0; x < def_levels.size(); x++) { + for (size_t x = 0; x < def_levels.size(); x++) { if (x % 10 == 0) { def_levels[x] = kMissingDefLevel; } From fe9263b2c1fde4385ec577f3010794754d4a9795 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 3 Jun 2020 05:12:30 +0000 Subject: [PATCH 39/40] address comments --- cpp/cmake_modules/SetupCxxFlags.cmake | 4 ++-- cpp/src/arrow/util/bit_util.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 3248320d617..a7985c63ad8 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -42,9 +42,9 @@ if(ARROW_CPU_FLAG STREQUAL "x86") set(CXX_SUPPORTS_SSE4_2 TRUE) else() set(ARROW_SSE4_2_FLAG "-msse4.2") - set(ARROW_AVX2_FLAG "-mavx2 -mbmi2") + set(ARROW_AVX2_FLAG "-mavx2") # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ - set(ARROW_AVX512_FLAG "-march=skylake-avx512") + set(ARROW_AVX512_FLAG "-march=skylake-avx512 -mbmi2") check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2) endif() check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 1bab31915ca..b8ba8464b7a 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -592,7 +592,7 @@ class FirstTimeBitmapWriter { /// to be unset (i.e. 0). /// \param[in] number_of_bits The number of bits to append from word. void AppendWord(uint64_t word, int64_t number_of_bits) { -#if defined(ARROW_LITTLE_ENDIAN) +#if ARROW_LITTLE_ENDIAN if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { return; } From a0d93c2cbd39fae511ebedb2f7c80307e7a83271 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 4 Jun 2020 04:27:17 +0000 Subject: [PATCH 40/40] try to fix s390x --- cpp/src/arrow/util/bit_util.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index b8ba8464b7a..a3b45cc79db 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -592,7 +592,6 @@ class FirstTimeBitmapWriter { /// to be unset (i.e. 0). /// \param[in] number_of_bits The number of bits to append from word. void AppendWord(uint64_t word, int64_t number_of_bits) { -#if ARROW_LITTLE_ENDIAN if (ARROW_PREDICT_FALSE(number_of_bits == 0)) { return; } @@ -624,7 +623,7 @@ class FirstTimeBitmapWriter { word = word >> bits_to_carry; number_of_bits -= bits_to_carry; } - + word = BitUtil::ToLittleEndian(word); int64_t bytes_for_word = ::arrow::BitUtil::BytesForBits(number_of_bits); std::memcpy(append_position, &word, bytes_for_word); // At this point, the previous current_byte_ has been written to bitmap_. @@ -635,10 +634,6 @@ class FirstTimeBitmapWriter { } else { current_byte_ = *(append_position + bytes_for_word - 1); } - -#else // big-endian - static_assert(false, "AppendWord not implement on Big Endian"); -#endif } void Set() { current_byte_ |= bit_mask_; }