From 482797c495e8874ec8bc970066cb0f5086645220 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 1 Oct 2020 19:43:50 +0200 Subject: [PATCH] ARROW-10058: [C++] Improve repeated levels conversion without BMI2 Use a lookup table to emulate PEXT 5 bits at a time. Remove the slow scalar path. --- cpp/src/parquet/level_conversion.cc | 57 +----- cpp/src/parquet/level_conversion.h | 2 +- cpp/src/parquet/level_conversion_inc.h | 242 ++++++++++++++++++++--- cpp/src/parquet/level_conversion_test.cc | 24 +-- 4 files changed, 235 insertions(+), 90 deletions(-) diff --git a/cpp/src/parquet/level_conversion.cc b/cpp/src/parquet/level_conversion.cc index 3f57be39e72..fadd56e1de9 100644 --- a/cpp/src/parquet/level_conversion.cc +++ b/cpp/src/parquet/level_conversion.cc @@ -36,41 +36,6 @@ namespace { using ::arrow::internal::CpuInfo; -void DefLevelsToBitmapScalar(const int16_t* def_levels, int64_t num_def_levels, - LevelInfo level_info, ValidityBitmapInputOutput* output) { - ::arrow::internal::FirstTimeBitmapWriter valid_bits_writer( - output->valid_bits, - /*start_offset=*/output->valid_bits_offset, - /*length=*/num_def_levels); - for (int x = 0; x < num_def_levels; x++) { - // This indicates that a parent repeated element has zero - // length so the def level is not applicable to this column. - if (def_levels[x] < level_info.repeated_ancestor_def_level) { - continue; - } - if (ARROW_PREDICT_FALSE(valid_bits_writer.position() >= - output->values_read_upper_bound)) { - std::stringstream ss; - ss << "Definition levels exceeded upper bound: " << output->values_read_upper_bound; - throw ParquetException(ss.str()); - } - if (def_levels[x] >= level_info.def_level) { - valid_bits_writer.Set(); - } else { - valid_bits_writer.Clear(); - output->null_count += 1; - } - valid_bits_writer.Next(); - } - valid_bits_writer.Finish(); - output->values_read = valid_bits_writer.position(); - if (output->null_count > 0 && level_info.null_slot_usage > 1) { - throw ParquetException( - "Null values with null_slot_usage > 1 not supported." - "(i.e. FixedSizeLists with null values are not supported"); - } -} - template void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_def_levels, LevelInfo level_info, @@ -170,27 +135,21 @@ void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, // is deleted in a follow-up release. if (level_info.rep_level > 0) { #if defined(ARROW_HAVE_RUNTIME_BMI2) - using FunctionType = decltype(&standard::DefLevelsToBitmapSimd); - // DefLevelsToBitmapSimd with emulated PEXT would be slow, so use the - // scalar version if BMI2 is unavailable. - static FunctionType fn = CpuInfo::GetInstance()->HasEfficientBmi2() - ? DefLevelsToBitmapBmi2WithRepeatedParent - : DefLevelsToBitmapScalar; - fn(def_levels, num_def_levels, level_info, output); -#else - DefLevelsToBitmapScalar(def_levels, num_def_levels, level_info, output); + if (CpuInfo::GetInstance()->HasEfficientBmi2()) { + return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels, + level_info, output); + } #endif + standard::DefLevelsToBitmapSimd( + def_levels, num_def_levels, level_info, output); } else { - // SIMD here applies to all platforms because the only operation that - // happens is def_levels->bitmap which should have good SIMD options - // on all platforms. standard::DefLevelsToBitmapSimd( def_levels, num_def_levels, level_info, output); } } -uint64_t TestOnlyRunBasedExtract(uint64_t bitmap, uint64_t select_bitmap) { - return standard::RunBasedExtractImpl(bitmap, select_bitmap); +uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) { + return standard::ExtractBitsSoftware(bitmap, select_bitmap); } void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels, diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index c664cbae4cb..d406724ce16 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -192,7 +192,7 @@ void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels, // This is exposed to ensure we can properly test a software simulated pext function // (i.e. it isn't hidden by runtime dispatch). -uint64_t PARQUET_EXPORT TestOnlyRunBasedExtract(uint64_t bitmap, uint64_t selection); +uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection); } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/level_conversion_inc.h b/cpp/src/parquet/level_conversion_inc.h index c688f748043..8f3d3d496de 100644 --- a/cpp/src/parquet/level_conversion_inc.h +++ b/cpp/src/parquet/level_conversion_inc.h @@ -35,44 +35,234 @@ namespace internal { #endif namespace PARQUET_IMPL_NAMESPACE { -/// Algorithm to simulate pext using BitRunReader for cases where all bits -/// not set or set. -uint64_t RunBasedExtractMixed(uint64_t bitmap, uint64_t select_bitmap) { - bitmap = arrow::BitUtil::FromLittleEndian(bitmap); - uint64_t new_bitmap = 0; - ::arrow::internal::BitRunReader selection(reinterpret_cast(&select_bitmap), - /*start_offset=*/0, /*length=*/64); - ::arrow::internal::BitRun run = selection.NextRun(); - int64_t selected_bits = 0; - while (run.length != 0) { - if (run.set) { - new_bitmap |= (bitmap & ::arrow::BitUtil::LeastSignficantBitMask(run.length)) - << selected_bits; - selected_bits += run.length; - } - bitmap = bitmap >> run.length; - run = selection.NextRun(); - } - return arrow::BitUtil::ToLittleEndian(new_bitmap); -} +// clang-format off +/* Python code to generate lookup table: + +kLookupBits = 5 +count = 0 +print('constexpr int kLookupBits = {};'.format(kLookupBits)) +print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {') +print(' ', end = '') +for mask in range(1 << kLookupBits): + for data in range(1 << kLookupBits): + bit_value = 0 + bit_len = 0 + for i in range(kLookupBits): + if mask & (1 << i): + bit_value |= (((data >> i) & 1) << bit_len) + bit_len += 1 + out = '0x{:02X},'.format(bit_value) + count += 1 + if count % (1 << kLookupBits) == 1: + print(' {') + if count % 8 == 1: + print(' ', end = '') + if count % 8 == 0: + print(out, end = '\n') + else: + print(out, end = ' ') + if count % (1 << kLookupBits) == 0: + print(' },', end = '') +print('\n};') + +*/ +// clang-format on -inline uint64_t RunBasedExtractImpl(uint64_t bitmap, uint64_t select_bitmap) { - /// These checks should be inline and are likely to be common cases. +constexpr int kLookupBits = 5; +constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = { + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, + 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, + 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, + 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, + 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, + 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, + 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, + 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, + 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, + 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, + 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, + 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, + 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, + 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, + 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, + 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, + 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, + 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, + 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, + 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, + 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, + 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, + 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05, + 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, + 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F, + }, + { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, + 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, + 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07, + }, + { + 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, + 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F, + }, + { + 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, + 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A, + 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F, + }, + { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, + 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + }, +}; + +inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) { + // A software emulation of _pext_u64 + + // These checks should be inline and are likely to be common cases. if (select_bitmap == ~uint64_t{0}) { return bitmap; } else if (select_bitmap == 0) { return 0; } - /// Fallback to the slow method. - return RunBasedExtractMixed(bitmap, select_bitmap); + + // Fallback to lookup table method + uint64_t bit_value = 0; + int bit_len = 0; + constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1; + while (select_bitmap != 0) { + const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask); + const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask]; + bit_value |= (value << bit_len); + bit_len += mask_len; + bitmap >>= kLookupBits; + select_bitmap >>= kLookupBits; + } + return bit_value; } inline uint64_t ExtractBits(uint64_t bitmap, uint64_t select_bitmap) { -// MING32 doesn't support 64-bit pext. + // MING32 doesn't support 64-bit pext. #if defined(ARROW_HAVE_BMI2) && !defined(__MINGW32__) return _pext_u64(bitmap, select_bitmap); #else - return RunBasedExtractImpl(bitmap, select_bitmap); + return ExtractBitsSoftware(bitmap, select_bitmap); #endif } diff --git a/cpp/src/parquet/level_conversion_test.cc b/cpp/src/parquet/level_conversion_test.cc index b4f2d3ad5d1..a3036758a0a 100644 --- a/cpp/src/parquet/level_conversion_test.cc +++ b/cpp/src/parquet/level_conversion_test.cc @@ -345,20 +345,16 @@ TYPED_TEST(NestedListTest, TestOverflow) { this->Run(test_data, level_info); } -TEST(TestOnlyRunBasedExtract, BasicTest) { - EXPECT_EQ(TestOnlyRunBasedExtract(arrow::BitUtil::ToLittleEndian(0xFF), 0), 0); - EXPECT_EQ(TestOnlyRunBasedExtract(arrow::BitUtil::ToLittleEndian(0xFF), ~uint64_t{0}), - arrow::BitUtil::ToLittleEndian(0xFF)); - - EXPECT_EQ(TestOnlyRunBasedExtract(arrow::BitUtil::ToLittleEndian(0xFF00FF), - arrow::BitUtil::ToLittleEndian(0xAAAA)), - arrow::BitUtil::ToLittleEndian(0x000F)); - EXPECT_EQ(TestOnlyRunBasedExtract(arrow::BitUtil::ToLittleEndian(0xFF0AFF), - arrow::BitUtil::ToLittleEndian(0xAFAA)), - arrow::BitUtil::ToLittleEndian(0x00AF)); - EXPECT_EQ(TestOnlyRunBasedExtract(arrow::BitUtil::ToLittleEndian(0xFFAAFF), - arrow::BitUtil::ToLittleEndian(0xAFAA)), - arrow::BitUtil::ToLittleEndian(0x03AF)); +TEST(TestOnlyExtractBitsSoftware, BasicTest) { + auto check = [](uint64_t bitmap, uint64_t selection, uint64_t expected) -> void { + EXPECT_EQ(TestOnlyExtractBitsSoftware(bitmap, selection), expected); + }; + check(0xFF, 0, 0); + check(0xFF, ~uint64_t{0}, 0xFF); + check(0xFF00FF, 0xAAAA, 0x000F); + check(0xFF0AFF, 0xAFAA, 0x00AF); + check(0xFFAAFF, 0xAFAA, 0x03AF); + check(0xFECBDA9876543210ULL, 0xF00FF00FF00FF00FULL, 0xFBD87430ULL); } } // namespace internal