diff --git a/dwio/nimble/encodings/EncodingSelectionPolicy.h b/dwio/nimble/encodings/EncodingSelectionPolicy.h
index b8a250e9..38a84fb5 100644
--- a/dwio/nimble/encodings/EncodingSelectionPolicy.h
+++ b/dwio/nimble/encodings/EncodingSelectionPolicy.h
@@ -172,10 +172,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy<T> {
     // minimal cost.
     for (const auto& pair : readFactors_) {
       auto encodingType = pair.first;
-      auto size =
+      auto sizeEstimation =
           detail::EncodingSizeEstimation<T>::estimateSize(
               encodingType, values.size(), statistics);
-      if (!size.has_value()) {
+      if (!sizeEstimation.has_value()) {
         NIMBLE_SELECTION_LOG(
             PURPLE << encodingType << " encoding is incompatible.");
         continue;
@@ -184,9 +184,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy<T> {
       // We use read factor weights to raise/lower the favorability of each
       // encoding.
       auto readFactor = pair.second;
-      auto cost = size.value() * readFactor;
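+      // The read factor scales only the data bytes of the estimate; the
+      // header bytes are a fixed cost (see SizeEstimation::cost).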
+      auto cost = sizeEstimation.value().cost(readFactor);
       NIMBLE_SELECTION_LOG(
-          YELLOW << "Encoding: " << encodingType << ", Size: " << size.value()
+          YELLOW << "Encoding: " << encodingType
+                 << ", Size: " << sizeEstimation.value().size()
                  << ", Factor: " << readFactor << ", Cost: " << cost);
       if (cost < minCost) {
         minCost = cost;
diff --git a/dwio/nimble/encodings/EncodingSizeEstimation.h b/dwio/nimble/encodings/EncodingSizeEstimation.h
index 790ed12f..56f76810 100644
--- a/dwio/nimble/encodings/EncodingSizeEstimation.h
+++ b/dwio/nimble/encodings/EncodingSizeEstimation.h
@@ -22,10 +22,29 @@
 #include "dwio/nimble/common/Exceptions.h"
 #include "dwio/nimble/common/FixedBitArray.h"
 #include "dwio/nimble/common/Types.h"
+#include "dwio/nimble/encodings/Statistics.h"
 
 namespace facebook::nimble {
 
 namespace detail {
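+// An estimated encoded size, split into a fixed header (metadata) portion and
+// a data portion. cost() weighs only the data portion by the read factor,
+// while size() is the plain total byte count.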
+struct SizeEstimation {
+ public:
+  SizeEstimation(uint64_t headerSize, uint64_t dataSize)
+      : headerSize_{headerSize}, dataSize_{dataSize} {}
+
+  uint64_t cost(double readFactor) const {
+    return headerSize_ + (dataSize_ * readFactor);
+  }
+
+  uint64_t size() const {
+    return headerSize_ + dataSize_;
+  }
+
+ private:
+  const uint64_t headerSize_;
+  const uint64_t dataSize_;
+};
+
 // This class is meant to quickly estimate the size of encoded data using a
 // given encoding type. It does a lot of assumptions, and it is not meant to be
 // 100% accurate.
@@ -33,16 +52,16 @@
 template <typename T>
 struct EncodingSizeEstimation {
   using physicalType = typename TypeTraits<T>::physicalType;
 
-  static std::optional<uint64_t> estimateNumericSize(
+  static std::optional<SizeEstimation> estimateNumericSize(
       const EncodingType encodingType,
       const uint64_t entryCount,
       const Statistics<physicalType>& statistics) {
     switch (encodingType) {
       case EncodingType::Constant: {
         return statistics.uniqueCounts().value().size() == 1
-            ? std::optional<uint64_t>{getEncodingOverhead<
-                  EncodingType::Constant,
-                  physicalType>()}
+            ? std::optional<SizeEstimation>{SizeEstimation{
+                  getEncodingOverhead<EncodingType::Constant, physicalType>(),
+                  0}}
             : std::nullopt;
       }
       case EncodingType::MainlyConstant: {
@@ -71,25 +90,26 @@ struct EncodingSizeEstimation {
         // stored bit packed.
         const auto uncommonIndicesSize =
             bitPackedBytes(0, entryCount, uncommonCount);
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::MainlyConstant, physicalType>() +
             // Overhead for storing uncommon values
             getEncodingOverhead<EncodingType::Trivial, physicalType>() +
             // Overhead for storing uncommon bitmap
             getEncodingOverhead<EncodingType::SparseBool, bool>() +
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + sizeof(physicalType) + uncommonValueSize +
-            uncommonIndicesSize;
+        return SizeEstimation{
+            overhead + sizeof(physicalType),
+            uncommonValueSize + uncommonIndicesSize};
       }
       case EncodingType::Trivial: {
-        return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
-            (entryCount * sizeof(physicalType));
+        return SizeEstimation{
+            getEncodingOverhead<EncodingType::Trivial, physicalType>(),
+            entryCount * sizeof(physicalType)};
       }
       case EncodingType::FixedBitWidth: {
-        return getEncodingOverhead<
-                   EncodingType::FixedBitWidth,
-                   physicalType>() +
-            bitPackedBytes(statistics.min(), statistics.max(), entryCount);
+        return SizeEstimation{
+            getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>(),
+            bitPackedBytes(statistics.min(), statistics.max(), entryCount)};
       }
       case EncodingType::Dictionary: {
         // Assumptions:
@@ -100,13 +120,13 @@ struct EncodingSizeEstimation {
             0, statistics.uniqueCounts().value().size(), entryCount);
         const uint64_t alphabetSize =
             statistics.uniqueCounts().value().size() * sizeof(physicalType);
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
             // Alphabet overhead
             getEncodingOverhead<EncodingType::Trivial, physicalType>() +
             // Indices overhead
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + alphabetSize + indicesSize;
+        return SizeEstimation{overhead, alphabetSize + indicesSize};
       }
       case EncodingType::RLE: {
         // Assumptions:
@@ -122,13 +142,13 @@ struct EncodingSizeEstimation {
             statistics.minRepeat(),
             statistics.maxRepeat(),
             statistics.consecutiveRepeatCount());
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::RLE, physicalType>() +
             // Overhead of run values
             getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
             // Overhead of run lengths
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + runValuesSize + runLengthsSize;
+        return SizeEstimation{overhead, runValuesSize + runLengthsSize};
      }
       case EncodingType::Varint: {
         // Note: the condition below actually support floating point numbers as
@@ -145,8 +165,9 @@ struct EncodingSizeEstimation {
             [&i](const uint64_t sum, const uint64_t bucketSize) {
               return sum + (bucketSize * (++i));
             });
-        return getEncodingOverhead<EncodingType::Varint, physicalType>() +
-            dataSize;
+        return SizeEstimation{
+            getEncodingOverhead<EncodingType::Varint, physicalType>(),
+            dataSize};
       } else {
         return std::nullopt;
       }
@@ -157,16 +178,16 @@ struct EncodingSizeEstimation {
     }
   }
 
-  static std::optional<uint64_t> estimateBoolSize(
+  static std::optional<SizeEstimation> estimateBoolSize(
       const EncodingType encodingType,
       const size_t entryCount,
       const Statistics<physicalType>& statistics) {
     switch (encodingType) {
       case EncodingType::Constant: {
         return statistics.uniqueCounts().value().size() == 1
-            ? std::optional<uint64_t>{getEncodingOverhead<
-                  EncodingType::Constant,
-                  physicalType>()}
+            ? std::optional<SizeEstimation>{SizeEstimation{
+                  getEncodingOverhead<EncodingType::Constant, physicalType>(),
+                  0}}
             : std::nullopt;
       }
       case EncodingType::SparseBool: {
@@ -177,16 +198,18 @@ struct EncodingSizeEstimation {
         const auto exceptionCount = std::min(
             statistics.uniqueCounts().value().at(true),
             statistics.uniqueCounts().value().at(false));
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::SparseBool, physicalType>() +
             // Overhead for storing exception indices
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + sizeof(bool) +
-            bitPackedBytes(0, entryCount, exceptionCount);
+        return SizeEstimation{
+            overhead + sizeof(bool),
+            bitPackedBytes(0, entryCount, exceptionCount)};
       }
       case EncodingType::Trivial: {
-        return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
-            FixedBitArray::bufferSize(entryCount, 1);
+        return SizeEstimation{
+            getEncodingOverhead<EncodingType::Trivial, physicalType>(),
+            FixedBitArray::bufferSize(entryCount, 1)};
       }
       case EncodingType::RLE: {
         // Assumptions:
@@ -197,11 +220,11 @@ struct EncodingSizeEstimation {
             statistics.minRepeat(),
             statistics.maxRepeat(),
             statistics.consecutiveRepeatCount());
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::RLE, physicalType>() +
             // Overhead of run lengths
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + sizeof(bool) + runLengthsSize;
+        return SizeEstimation{overhead + sizeof(bool), runLengthsSize};
       }
       default: {
         return std::nullopt;
@@ -209,7 +232,7 @@ struct EncodingSizeEstimation {
     }
   }
 
-  static std::optional<uint64_t> estimateStringSize(
+  static std::optional<SizeEstimation> estimateStringSize(
       const EncodingType encodingType,
       const size_t entryCount,
       const Statistics<physicalType>& statistics) {
@@ -217,9 +240,10 @@ struct EncodingSizeEstimation {
     switch (encodingType) {
       case EncodingType::Constant: {
         return statistics.uniqueCounts().value().size() == 1
-            ? std::optional<uint64_t>{getEncodingOverhead<
-                  EncodingType::Constant,
-                  physicalType>(maxStringSize)}
+            ? std::optional<SizeEstimation>{SizeEstimation{
+                  getEncodingOverhead<EncodingType::Constant, physicalType>(
+                      maxStringSize),
+                  0}}
             : std::nullopt;
       }
       case EncodingType::MainlyConstant: {
@@ -264,7 +288,7 @@ struct EncodingSizeEstimation {
         // stored bit packed.
         const auto uncommonIndicesSize =
             bitPackedBytes(0, entryCount, uncommonCount);
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::MainlyConstant, physicalType>(
                 maxUniqueCount->first.size()) +
             // Overhead for storing uncommon values
@@ -273,17 +297,18 @@ struct EncodingSizeEstimation {
             // Overhead for storing uncommon bitmap
             getEncodingOverhead<EncodingType::SparseBool, bool>();
-        return overhead + alphabetSize + uncommonIndicesSize;
+        return SizeEstimation{overhead, alphabetSize + uncommonIndicesSize};
       }
       case EncodingType::Trivial: {
         // We assume string lengths will be stored bit packed.
-        return getEncodingOverhead<EncodingType::Trivial, physicalType>(
-                   maxStringSize) +
+        return SizeEstimation{
+            getEncodingOverhead<EncodingType::Trivial, physicalType>(
+                maxStringSize),
             statistics.totalStringsLength() +
-            bitPackedBytes(
-                statistics.min().size(),
-                statistics.max().size(),
-                entryCount);
+                bitPackedBytes(
+                    statistics.min().size(),
+                    statistics.max().size(),
+                    entryCount)};
       }
       case EncodingType::Dictionary: {
         // Assumptions:
@@ -305,7 +330,7 @@ struct EncodingSizeEstimation {
             bitPackedBytes(statistics.min().size(),
                            statistics.max().size(),
                            statistics.uniqueCounts().value().size());
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::Dictionary, physicalType>(
                 maxStringSize) +
             // Alphabet overhead
@@ -313,7 +338,7 @@ struct EncodingSizeEstimation {
                 maxStringSize) +
             // Indices overhead
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + alphabetSize + indicesSize;
+        return SizeEstimation{overhead, alphabetSize + indicesSize};
       }
       case EncodingType::RLE: {
         // Assumptions:
@@ -321,7 +346,7 @@ struct EncodingSizeEstimation {
         // bit-packing). Run lengths are stored using bit-packing (with bit
         // width needed to store max repetition count).
-        uint64_t runValuesSize =
+        const uint64_t runValuesSize =
             // (unique) strings blob size
             std::accumulate(
                 statistics.uniqueCounts().value().cbegin(),
@@ -344,7 +369,7 @@ struct EncodingSizeEstimation {
                 statistics.minRepeat(),
                 statistics.maxRepeat(),
                 statistics.consecutiveRepeatCount());
-        uint32_t overhead =
+        const uint32_t overhead =
             getEncodingOverhead<EncodingType::RLE, physicalType>() +
             // Overhead of run values
             getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
@@ -352,7 +377,7 @@ struct EncodingSizeEstimation {
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>() +
             // Overhead of run lengths
             getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
-        return overhead + runValuesSize + runLengthsSize;
+        return SizeEstimation{overhead, runValuesSize + runLengthsSize};
       }
       default: {
         return std::nullopt;
@@ -360,7 +385,7 @@ struct EncodingSizeEstimation {
     }
   }
 
-  static std::optional<uint64_t> estimateSize(
+  static std::optional<SizeEstimation> estimateSize(
       const EncodingType encodingType,
       const size_t entryCount,
       const Statistics<physicalType>& statistics) {
diff --git a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
index 1ba93ae9..c30fbd0f 100644
--- a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
+++ b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
@@ -134,7 +134,7 @@ void verifySizeEstimate(
           encodingTypeForEstimation,
           values.size(),
           nimble::Statistics<T>::create(values));
-  EXPECT_EQ(estimatedSize, expectedEstimatedSize);
+  EXPECT_EQ(estimatedSize.value().size(), expectedEstimatedSize);
 }
 
 template <typename T>
@@ -180,7 +180,7 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
     LOG(INFO) << "Expected: " << expected[i].encodingType << "<"
               << expected[i].dataType << ">[" << expected[i].nestedEncodingName
               << ":" << expected[i].level << "]";
-    LOG(INFO) << "Actual: " << actual[i].encodingType << "<"
+    LOG(INFO) << "Actual:   " << actual[i].encodingType << "<"
               << actual[i].dataType << ">[" << actual[i].nestedEncodingName
               << ":" << actual[i].level << "]";
     EXPECT_EQ(expected[i].encodingType, actual[i].encodingType);
@@ -435,7 +435,7 @@ TYPED_TEST(EncodingSelectionNumericTests, SelectRunLength) {
   if constexpr (
       nimble::isFloatingPointType<T>() || std::is_same_v<T, uint32_t> ||
-      sizeof(T) > 4) {
+      sizeof(T) >= 4) {
     // Floating point types and big types prefer storing the run values as
     // dictionary
     test<T>(
@@ -994,3 +994,16 @@ TEST(EncodingSelectionTests, TestNullable) {
 
   LOG(INFO) << "Final size: " << serialized.size();
 }
+
+TEST(EncodingSelectionTests, TestSizeEstimateCost) {
+  std::vector<int8_t> values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  auto estimatedSize =
+      nimble::detail::EncodingSizeEstimation<int8_t>::estimateSize(
+          nimble::EncodingType::Trivial,
+          values.size(),
+          nimble::Statistics<int8_t>::create(values))
+          .value();
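+  // Trivial layout for ten 1-byte values: 7 header bytes + 10 data bytes.
+  // cost(f) = header + data * f, so cost(10) = 7 + 100 and cost(0) = 7.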
+  EXPECT_EQ(estimatedSize.size(), 17);
+  EXPECT_EQ(estimatedSize.cost(10), 107);
+  EXPECT_EQ(estimatedSize.cost(0), 7);
+}