9 changes: 5 additions & 4 deletions dwio/nimble/encodings/EncodingSelectionPolicy.h
@@ -172,10 +172,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy<T> {
// minimal cost.
for (const auto& pair : readFactors_) {
auto encodingType = pair.first;
auto size =
auto sizeEstimation =
detail::EncodingSizeEstimation<T, FixedByteWidth>::estimateSize(
encodingType, values.size(), statistics);
if (!size.has_value()) {
if (!sizeEstimation.has_value()) {
NIMBLE_SELECTION_LOG(
PURPLE << encodingType << " encoding is incompatible.");
continue;
@@ -184,9 +184,10 @@ class ManualEncodingSelectionPolicy : public EncodingSelectionPolicy<T> {
// We use read factor weights to raise/lower the favorability of each
// encoding.
auto readFactor = pair.second;
auto cost = size.value() * readFactor;
auto cost = sizeEstimation.value().cost(readFactor);
NIMBLE_SELECTION_LOG(
YELLOW << "Encoding: " << encodingType << ", Size: " << size.value()
YELLOW << "Encoding: " << encodingType
<< ", Size: " << sizeEstimation.value().size()
<< ", Factor: " << readFactor << ", Cost: " << cost);
if (cost < minCost) {
minCost = cost;
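The net effect of this change is that the read factor now scales only the data portion of the size estimate, while the fixed header overhead is counted once. As an illustrative calculation, taking the numbers implied by the new test added below (a 7-byte header and 10 bytes of data): the old cost at a read factor of 10 would have been (7 + 10) * 10 = 170, whereas the new cost is 7 + 10 * 10 = 107.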
121 changes: 73 additions & 48 deletions dwio/nimble/encodings/EncodingSizeEstimation.h
@@ -22,27 +22,46 @@
#include "dwio/nimble/common/Exceptions.h"
#include "dwio/nimble/common/FixedBitArray.h"
#include "dwio/nimble/common/Types.h"
#include "dwio/nimble/encodings/Statistics.h"

namespace facebook::nimble {
namespace detail {

struct SizeEstimation {
public:
SizeEstimation(uint64_t headerSize, uint64_t dataSize)
: headerSize_{headerSize}, dataSize_{dataSize} {}

uint64_t cost(double readFactor) const {
return headerSize_ + (dataSize_ * readFactor);
}

uint64_t size() const {
return headerSize_ + dataSize_;
}

private:
const uint64_t headerSize_;
const uint64_t dataSize_;
};
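A minimal usage sketch of the struct above, with illustrative numbers (an 8-byte header and 100 bytes of data) rather than values from a real encoding:

  #include "dwio/nimble/encodings/EncodingSizeEstimation.h"

  // Illustrative numbers only: an 8-byte header and 100 bytes of encoded data.
  const facebook::nimble::detail::SizeEstimation estimation{/*headerSize=*/8, /*dataSize=*/100};
  const uint64_t totalSize = estimation.size();    // 108: header + data, the raw size estimate.
  const uint64_t weighted = estimation.cost(0.5);  // 58: header counted once, data scaled by the read factor.

This split is what lets the selection policy penalize encodings with more data to decode without multiplying the one-time header overhead by the read factor.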

// This class is meant to quickly estimate the size of encoded data using a
// given encoding type. It makes many assumptions and is not meant to be
// 100% accurate.
template <typename T, bool FixedByteWidth>
struct EncodingSizeEstimation {
using physicalType = typename TypeTraits<T>::physicalType;

static std::optional<uint64_t> estimateNumericSize(
static std::optional<SizeEstimation> estimateNumericSize(
const EncodingType encodingType,
const uint64_t entryCount,
const Statistics<physicalType>& statistics) {
switch (encodingType) {
case EncodingType::Constant: {
return statistics.uniqueCounts().value().size() == 1
? std::optional<uint64_t>{getEncodingOverhead<
EncodingType::Constant,
physicalType>()}
? std::optional<SizeEstimation>{SizeEstimation{
getEncodingOverhead<EncodingType::Constant, physicalType>(),
0}}
: std::nullopt;
}
case EncodingType::MainlyConstant: {
@@ -71,25 +90,26 @@ struct EncodingSizeEstimation {
// stored bit packed.
const auto uncommonIndicesSize =
bitPackedBytes(0, entryCount, uncommonCount);
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::MainlyConstant, physicalType>() +
// Overhead for storing uncommon values
getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
// Overhead for storing uncommon bitmap
getEncodingOverhead<EncodingType::SparseBool, bool>() +
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + sizeof(physicalType) + uncommonValueSize +
uncommonIndicesSize;
return SizeEstimation{
overhead + sizeof(physicalType),
uncommonValueSize + uncommonIndicesSize};
}
case EncodingType::Trivial: {
return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
(entryCount * sizeof(physicalType));
return SizeEstimation{
getEncodingOverhead<EncodingType::Trivial, physicalType>(),
entryCount * sizeof(physicalType)};
}
case EncodingType::FixedBitWidth: {
return getEncodingOverhead<
EncodingType::FixedBitWidth,
physicalType>() +
bitPackedBytes(statistics.min(), statistics.max(), entryCount);
return SizeEstimation{
getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>(),
bitPackedBytes(statistics.min(), statistics.max(), entryCount)};
}
case EncodingType::Dictionary: {
// Assumptions:
@@ -100,13 +120,13 @@
0, statistics.uniqueCounts().value().size(), entryCount);
const uint64_t alphabetSize =
statistics.uniqueCounts().value().size() * sizeof(physicalType);
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
// Alphabet overhead
getEncodingOverhead<EncodingType::Trivial, physicalType>() +
// Indices overhead
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + alphabetSize + indicesSize;
return SizeEstimation{overhead, alphabetSize + indicesSize};
}
case EncodingType::RLE: {
// Assumptions:
@@ -122,13 +142,13 @@
statistics.minRepeat(),
statistics.maxRepeat(),
statistics.consecutiveRepeatCount());
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::RLE, physicalType>() +
// Overhead of run values
getEncodingOverhead<EncodingType::FixedBitWidth, physicalType>() +
// Overhead of run lengths
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + runValuesSize + runLengthsSize;
return SizeEstimation{overhead, runValuesSize + runLengthsSize};
}
case EncodingType::Varint: {
// Note: the condition below actually supports floating point numbers as
@@ -145,8 +165,9 @@
[&i](const uint64_t sum, const uint64_t bucketSize) {
return sum + (bucketSize * (++i));
});
return getEncodingOverhead<EncodingType::Varint, physicalType>() +
dataSize;
return SizeEstimation{
getEncodingOverhead<EncodingType::Varint, physicalType>(),
dataSize};
} else {
return std::nullopt;
}
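Reading the reduction lambda above, under the assumption that the i-th bucket of the statistics (1-based) counts the values whose varint representation needs i bytes:

  dataSize = 1 * b_1 + 2 * b_2 + ... = sum over i of (i * b_i)

i.e. the estimated data size is the total number of varint bytes across all values.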
@@ -157,16 +178,16 @@
}
}

static std::optional<uint64_t> estimateBoolSize(
static std::optional<SizeEstimation> estimateBoolSize(
const EncodingType encodingType,
const size_t entryCount,
const Statistics<physicalType>& statistics) {
switch (encodingType) {
case EncodingType::Constant: {
return statistics.uniqueCounts().value().size() == 1
? std::optional<uint64_t>{getEncodingOverhead<
EncodingType::Constant,
physicalType>()}
? std::optional<SizeEstimation>{SizeEstimation{
getEncodingOverhead<EncodingType::Constant, physicalType>(),
0}}
: std::nullopt;
}
case EncodingType::SparseBool: {
@@ -177,16 +198,18 @@
const auto exceptionCount = std::min(
statistics.uniqueCounts().value().at(true),
statistics.uniqueCounts().value().at(false));
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::SparseBool, physicalType>() +
// Overhead for storing exception indices
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + sizeof(bool) +
bitPackedBytes(0, entryCount, exceptionCount);
return SizeEstimation{
overhead + sizeof(bool),
bitPackedBytes(0, entryCount, exceptionCount)};
}
case EncodingType::Trivial: {
return getEncodingOverhead<EncodingType::Trivial, physicalType>() +
FixedBitArray::bufferSize(entryCount, 1);
return SizeEstimation{
getEncodingOverhead<EncodingType::Trivial, physicalType>(),
FixedBitArray::bufferSize(entryCount, 1)};
}
case EncodingType::RLE: {
// Assumptions:
@@ -197,29 +220,30 @@
statistics.minRepeat(),
statistics.maxRepeat(),
statistics.consecutiveRepeatCount());
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::RLE, physicalType>() +
// Overhead of run lengths
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + sizeof(bool) + runLengthsSize;
return SizeEstimation{overhead + sizeof(bool), runLengthsSize};
}
default: {
return std::nullopt;
}
}
}

static std::optional<uint64_t> estimateStringSize(
static std::optional<SizeEstimation> estimateStringSize(
const EncodingType encodingType,
const size_t entryCount,
const Statistics<std::string_view>& statistics) {
const uint32_t maxStringSize = statistics.max().size();
switch (encodingType) {
case EncodingType::Constant: {
return statistics.uniqueCounts().value().size() == 1
? std::optional<uint64_t>{getEncodingOverhead<
EncodingType::Constant,
physicalType>(maxStringSize)}
? std::optional<SizeEstimation>{SizeEstimation{
getEncodingOverhead<EncodingType::Constant, physicalType>(
maxStringSize),
0}}
: std::nullopt;
}
case EncodingType::MainlyConstant: {
@@ -264,7 +288,7 @@
// stored bit packed.
const auto uncommonIndicesSize =
bitPackedBytes(0, entryCount, uncommonCount);
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::MainlyConstant, physicalType>(
maxUniqueCount->first.size()) +
// Overhead for storing uncommon values
@@ -273,17 +297,18 @@
// Overhead for storing uncommon bitmap
getEncodingOverhead<EncodingType::SparseBool, bool>();

return overhead + alphabetSize + uncommonIndicesSize;
return SizeEstimation{overhead, alphabetSize + uncommonIndicesSize};
}
case EncodingType::Trivial: {
// We assume string lengths will be stored bit packed.
return getEncodingOverhead<EncodingType::Trivial, physicalType>(
maxStringSize) +
return SizeEstimation{
getEncodingOverhead<EncodingType::Trivial, physicalType>(
maxStringSize),
statistics.totalStringsLength() +
bitPackedBytes(
statistics.min().size(),
statistics.max().size(),
entryCount);
bitPackedBytes(
statistics.min().size(),
statistics.max().size(),
entryCount)};
}
case EncodingType::Dictionary: {
// Assumptions:
@@ -305,23 +330,23 @@
bitPackedBytes(statistics.min().size(),
statistics.max().size(),
statistics.uniqueCounts().value().size());
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::Dictionary, physicalType>(
maxStringSize) +
// Alphabet overhead
getEncodingOverhead<EncodingType::Trivial, physicalType>(
maxStringSize) +
// Indices overhead
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + alphabetSize + indicesSize;
return SizeEstimation{overhead, alphabetSize + indicesSize};
}
case EncodingType::RLE: {
// Assumptions:
// Run values are stored using dictionary (and inside, trivial +
// bit-packing). Run lengths are stored using bit-packing (with bit
// width needed to store max repetition count).

uint64_t runValuesSize =
const uint64_t runValuesSize =
// (unique) strings blob size
std::accumulate(
statistics.uniqueCounts().value().cbegin(),
@@ -344,23 +369,23 @@
statistics.minRepeat(),
statistics.maxRepeat(),
statistics.consecutiveRepeatCount());
uint32_t overhead =
const uint32_t overhead =
getEncodingOverhead<EncodingType::RLE, physicalType>() +
// Overhead of run values
getEncodingOverhead<EncodingType::Dictionary, physicalType>() +
getEncodingOverhead<EncodingType::Trivial, physicalType>() +
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>() +
// Overhead of run lengths
getEncodingOverhead<EncodingType::FixedBitWidth, uint32_t>();
return overhead + runValuesSize + runLengthsSize;
return SizeEstimation{overhead, runValuesSize + runLengthsSize};
}
default: {
return std::nullopt;
}
}
}

static std::optional<uint64_t> estimateSize(
static std::optional<SizeEstimation> estimateSize(
const EncodingType encodingType,
const size_t entryCount,
const Statistics<physicalType>& statistics) {
19 changes: 16 additions & 3 deletions dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
@@ -134,7 +134,7 @@ void verifySizeEstimate(
encodingTypeForEstimation,
values.size(),
nimble::Statistics<T>::create(values));
EXPECT_EQ(estimatedSize, expectedEstimatedSize);
EXPECT_EQ(estimatedSize.value().size(), expectedEstimatedSize);
}

template <typename T>
@@ -180,7 +180,7 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
LOG(INFO) << "Expected: " << expected[i].encodingType << "<"
<< expected[i].dataType << ">[" << expected[i].nestedEncodingName
<< ":" << expected[i].level << "]";
LOG(INFO) << "Actual: " << actual[i].encodingType << "<"
LOG(INFO) << "Actual: " << actual[i].encodingType << "<"
<< actual[i].dataType << ">[" << actual[i].nestedEncodingName
<< ":" << actual[i].level << "]";
EXPECT_EQ(expected[i].encodingType, actual[i].encodingType);
@@ -435,7 +435,7 @@ TYPED_TEST(EncodingSelectionNumericTests, SelectRunLength) {

if constexpr (
nimble::isFloatingPointType<T>() || std::is_same_v<int32_t, T> ||
sizeof(T) > 4) {
sizeof(T) >= 4) {
// Floating point types and big types prefer storing the run values as
// dictionary
test<T>(
@@ -994,3 +994,16 @@ TEST(EncodingSelectionTests, TestNullable) {

LOG(INFO) << "Final size: " << serialized.size();
}

TEST(EncodingSelectionTests, TestSizeEstimateCost) {
std::vector<uint8_t> values{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
auto estimatedSize =
nimble::detail::EncodingSizeEstimation<uint8_t, false>::estimateSize(
nimble::EncodingType::Trivial,
values.size(),
nimble::Statistics<uint8_t>::create(values))
.value();
EXPECT_EQ(estimatedSize.size(), 17);
EXPECT_EQ(estimatedSize.cost(10), 107);
EXPECT_EQ(estimatedSize.cost(0), 7);
}
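For reference, the expected values follow from the Trivial estimate for ten one-byte values, assuming a 7-byte encoding overhead for this type (as the expectations imply): size() = 7 + 10 = 17, cost(10) = 7 + 10 * 10 = 107, and cost(0) = 7 + 0 = 7.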