From eb5b38a5ea034130d0bdfdca1e0613bc7ad96b0e Mon Sep 17 00:00:00 2001 From: Serge Druzkin Date: Thu, 20 Nov 2025 17:58:05 -0800 Subject: [PATCH] Split header and data size in encoding size estimation (#279) Summary: Pull Request resolved: https://github.com/facebookincubator/nimble/pull/279 Split header and data size in encoding size estimation, so that the read factor can only be applied to the compressible data. The current implementation gives funky estimates for small-sized data. Differential Revision: D84458948 --- .../encodings/EncodingSelectionPolicy.h | 9 +- .../nimble/encodings/EncodingSizeEstimation.h | 121 +++++++++++------- .../tests/EncodingSelectionTests.cpp | 19 ++- 3 files changed, 94 insertions(+), 55 deletions(-) diff --git a/dwio/nimble/encodings/EncodingSelectionPolicy.h b/dwio/nimble/encodings/EncodingSelectionPolicy.h index b8a250e9..38a84fb5 100644 --- a/dwio/nimble/encodings/EncodingSelectionPolicy.h +++ b/dwio/nimble/encodings/EncodingSelectionPolicy.h @@ -172,10 +172,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy { // minimal cost. for (const auto& pair : readFactors_) { auto encodingType = pair.first; - auto size = + auto sizeEstimation = detail::EncodingSizeEstimation::estimateSize( encodingType, values.size(), statistics); - if (!size.has_value()) { + if (!sizeEstimation.has_value()) { NIMBLE_SELECTION_LOG( PURPLE << encodingType << " encoding is incompatible."); continue; } @@ -184,9 +184,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy { // We use read factor weights to raise/lower the favorability of each // encoding. 
auto readFactor = pair.second; - auto cost = size.value() * readFactor; + auto cost = sizeEstimation.value().cost(readFactor); NIMBLE_SELECTION_LOG( - YELLOW << "Encoding: " << encodingType << ", Size: " << size.value() + YELLOW << "Encoding: " << encodingType + << ", Size: " << sizeEstimation.value().size() << ", Factor: " << readFactor << ", Cost: " << cost); if (cost < minCost) { minCost = cost; diff --git a/dwio/nimble/encodings/EncodingSizeEstimation.h b/dwio/nimble/encodings/EncodingSizeEstimation.h index 790ed12f..56f76810 100644 --- a/dwio/nimble/encodings/EncodingSizeEstimation.h +++ b/dwio/nimble/encodings/EncodingSizeEstimation.h @@ -22,10 +22,29 @@ #include "dwio/nimble/common/Exceptions.h" #include "dwio/nimble/common/FixedBitArray.h" #include "dwio/nimble/common/Types.h" +#include "dwio/nimble/encodings/Statistics.h" namespace facebook::nimble { namespace detail { +struct SizeEstimation { + public: + SizeEstimation(uint64_t headerSize, uint64_t dataSize) + : headerSize_{headerSize}, dataSize_{dataSize} {} + + uint64_t cost(double readFactor) const { + return headerSize_ + (dataSize_ * readFactor); + } + + uint64_t size() const { + return headerSize_ + dataSize_; + } + + private: + const uint64_t headerSize_; + const uint64_t dataSize_; +}; + // This class is meant to quickly estimate the size of encoded data using a // given encoding type. It does a lot of assumptions, and it is not meant to be // 100% accurate. @@ -33,16 +52,16 @@ template struct EncodingSizeEstimation { using physicalType = typename TypeTraits::physicalType; - static std::optional estimateNumericSize( + static std::optional estimateNumericSize( const EncodingType encodingType, const uint64_t entryCount, const Statistics& statistics) { switch (encodingType) { case EncodingType::Constant: { return statistics.uniqueCounts().value().size() == 1 - ? std::optional{getEncodingOverhead< - EncodingType::Constant, - physicalType>()} + ? 
std::optional{SizeEstimation{ + getEncodingOverhead(), + 0}} : std::nullopt; } case EncodingType::MainlyConstant: { @@ -71,25 +90,26 @@ struct EncodingSizeEstimation { // stored bit packed. const auto uncommonIndicesSize = bitPackedBytes(0, entryCount, uncommonCount); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Overhead for storing uncommon values getEncodingOverhead() + // Overhead for storing uncommon bitmap getEncodingOverhead() + getEncodingOverhead(); - return overhead + sizeof(physicalType) + uncommonValueSize + - uncommonIndicesSize; + return SizeEstimation{ + overhead + sizeof(physicalType), + uncommonValueSize + uncommonIndicesSize}; } case EncodingType::Trivial: { - return getEncodingOverhead() + - (entryCount * sizeof(physicalType)); + return SizeEstimation{ + getEncodingOverhead(), + entryCount * sizeof(physicalType)}; } case EncodingType::FixedBitWidth: { - return getEncodingOverhead< - EncodingType::FixedBitWidth, - physicalType>() + - bitPackedBytes(statistics.min(), statistics.max(), entryCount); + return SizeEstimation{ + getEncodingOverhead(), + bitPackedBytes(statistics.min(), statistics.max(), entryCount)}; } case EncodingType::Dictionary: { // Assumptions: @@ -100,13 +120,13 @@ struct EncodingSizeEstimation { 0, statistics.uniqueCounts().value().size(), entryCount); const uint64_t alphabetSize = statistics.uniqueCounts().value().size() * sizeof(physicalType); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Alphabet overhead getEncodingOverhead() + // Indices overhead getEncodingOverhead(); - return overhead + alphabetSize + indicesSize; + return SizeEstimation{overhead, alphabetSize + indicesSize}; } case EncodingType::RLE: { // Assumptions: @@ -122,13 +142,13 @@ struct EncodingSizeEstimation { statistics.minRepeat(), statistics.maxRepeat(), statistics.consecutiveRepeatCount()); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Overhead of run values 
getEncodingOverhead() + // Overhead of run lengths getEncodingOverhead(); - return overhead + runValuesSize + runLengthsSize; + return SizeEstimation{overhead, runValuesSize + runLengthsSize}; } case EncodingType::Varint: { // Note: the condition below actually support floating point numbers as @@ -145,8 +165,9 @@ struct EncodingSizeEstimation { [&i](const uint64_t sum, const uint64_t bucketSize) { return sum + (bucketSize * (++i)); }); - return getEncodingOverhead() + - dataSize; + return SizeEstimation{ + getEncodingOverhead(), + dataSize}; } else { return std::nullopt; } @@ -157,16 +178,16 @@ struct EncodingSizeEstimation { } } - static std::optional estimateBoolSize( + static std::optional estimateBoolSize( const EncodingType encodingType, const size_t entryCount, const Statistics& statistics) { switch (encodingType) { case EncodingType::Constant: { return statistics.uniqueCounts().value().size() == 1 - ? std::optional{getEncodingOverhead< - EncodingType::Constant, - physicalType>()} + ? 
std::optional{SizeEstimation{ + getEncodingOverhead(), + 0}} : std::nullopt; } case EncodingType::SparseBool: { @@ -177,16 +198,18 @@ struct EncodingSizeEstimation { const auto exceptionCount = std::min( statistics.uniqueCounts().value().at(true), statistics.uniqueCounts().value().at(false)); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Overhead for storing exception indices getEncodingOverhead(); - return overhead + sizeof(bool) + - bitPackedBytes(0, entryCount, exceptionCount); + return SizeEstimation{ + overhead + sizeof(bool), + bitPackedBytes(0, entryCount, exceptionCount)}; } case EncodingType::Trivial: { - return getEncodingOverhead() + - FixedBitArray::bufferSize(entryCount, 1); + return SizeEstimation{ + getEncodingOverhead(), + FixedBitArray::bufferSize(entryCount, 1)}; } case EncodingType::RLE: { // Assumptions: @@ -197,11 +220,11 @@ struct EncodingSizeEstimation { statistics.minRepeat(), statistics.maxRepeat(), statistics.consecutiveRepeatCount()); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Overhead of run lengths getEncodingOverhead(); - return overhead + sizeof(bool) + runLengthsSize; + return SizeEstimation{overhead + sizeof(bool), runLengthsSize}; } default: { return std::nullopt; @@ -209,7 +232,7 @@ struct EncodingSizeEstimation { } } - static std::optional estimateStringSize( + static std::optional estimateStringSize( const EncodingType encodingType, const size_t entryCount, const Statistics& statistics) { @@ -217,9 +240,10 @@ struct EncodingSizeEstimation { switch (encodingType) { case EncodingType::Constant: { return statistics.uniqueCounts().value().size() == 1 - ? std::optional{getEncodingOverhead< - EncodingType::Constant, - physicalType>(maxStringSize)} + ? std::optional{SizeEstimation{ + getEncodingOverhead( + maxStringSize), + 0}} : std::nullopt; } case EncodingType::MainlyConstant: { @@ -264,7 +288,7 @@ struct EncodingSizeEstimation { // stored bit packed. 
const auto uncommonIndicesSize = bitPackedBytes(0, entryCount, uncommonCount); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead( maxUniqueCount->first.size()) + // Overhead for storing uncommon values @@ -273,17 +297,18 @@ struct EncodingSizeEstimation { // Overhead for storing uncommon bitmap getEncodingOverhead(); - return overhead + alphabetSize + uncommonIndicesSize; + return SizeEstimation{overhead, alphabetSize + uncommonIndicesSize}; } case EncodingType::Trivial: { // We assume string lengths will be stored bit packed. - return getEncodingOverhead( - maxStringSize) + + return SizeEstimation{ + getEncodingOverhead( + maxStringSize), statistics.totalStringsLength() + - bitPackedBytes( - statistics.min().size(), - statistics.max().size(), - entryCount); + bitPackedBytes( + statistics.min().size(), + statistics.max().size(), + entryCount)}; } case EncodingType::Dictionary: { // Assumptions: @@ -305,7 +330,7 @@ struct EncodingSizeEstimation { bitPackedBytes(statistics.min().size(), statistics.max().size(), statistics.uniqueCounts().value().size()); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead( maxStringSize) + // Alphabet overhead @@ -313,7 +338,7 @@ struct EncodingSizeEstimation { maxStringSize) + // Indices overhead getEncodingOverhead(); - return overhead + alphabetSize + indicesSize; + return SizeEstimation{overhead, alphabetSize + indicesSize}; } case EncodingType::RLE: { // Assumptions: @@ -321,7 +346,7 @@ struct EncodingSizeEstimation { // bit-packing). Run lengths are stored using bit-packing (with bit // width needed to store max repetition count). 
- uint64_t runValuesSize = + const uint64_t runValuesSize = // (unique) strings blob size std::accumulate( statistics.uniqueCounts().value().cbegin(), @@ -344,7 +369,7 @@ struct EncodingSizeEstimation { statistics.minRepeat(), statistics.maxRepeat(), statistics.consecutiveRepeatCount()); - uint32_t overhead = + const uint32_t overhead = getEncodingOverhead() + // Overhead of run values getEncodingOverhead() + @@ -352,7 +377,7 @@ struct EncodingSizeEstimation { getEncodingOverhead() + // Overhead of run lengths getEncodingOverhead(); - return overhead + runValuesSize + runLengthsSize; + return SizeEstimation{overhead, runValuesSize + runLengthsSize}; } default: { return std::nullopt; @@ -360,7 +385,7 @@ struct EncodingSizeEstimation { } } - static std::optional estimateSize( + static std::optional estimateSize( const EncodingType encodingType, const size_t entryCount, const Statistics& statistics) { diff --git a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp index 1ba93ae9..c30fbd0f 100644 --- a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp +++ b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp @@ -134,7 +134,7 @@ void verifySizeEstimate( encodingTypeForEstimation, values.size(), nimble::Statistics::create(values)); - EXPECT_EQ(estimatedSize, expectedEstimatedSize); + EXPECT_EQ(estimatedSize.value().size(), expectedEstimatedSize); } template @@ -180,7 +180,7 @@ void test(std::span values, std::vector expected) { LOG(INFO) << "Expected: " << expected[i].encodingType << "<" << expected[i].dataType << ">[" << expected[i].nestedEncodingName << ":" << expected[i].level << "]"; - LOG(INFO) << "Actual: " << actual[i].encodingType << "<" + LOG(INFO) << "Actual: " << actual[i].encodingType << "<" << actual[i].dataType << ">[" << actual[i].nestedEncodingName << ":" << actual[i].level << "]"; EXPECT_EQ(expected[i].encodingType, actual[i].encodingType); @@ -435,7 +435,7 @@ 
TYPED_TEST(EncodingSelectionNumericTests, SelectRunLength) { if constexpr ( nimble::isFloatingPointType() || std::is_same_v || - sizeof(T) > 4) { + sizeof(T) >= 4) { // Floating point types and big types prefer storing the run values as // dictionary test( @@ -994,3 +994,16 @@ TEST(EncodingSelectionTests, TestNullable) { LOG(INFO) << "Final size: " << serialized.size(); } + +TEST(EncodingSelectionTests, TestSizeEstimateCost) { + std::vector values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto estimatedSize = + nimble::detail::EncodingSizeEstimation::estimateSize( + nimble::EncodingType::Trivial, + values.size(), + nimble::Statistics::create(values)) + .value(); + EXPECT_EQ(estimatedSize.size(), 17); + EXPECT_EQ(estimatedSize.cost(10), 107); + EXPECT_EQ(estimatedSize.cost(0), 7); +}