-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-8413: [C++][Parquet] Refactor Generating validity bitmap for values column #6985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0a7e537
83c38c4
3480711
677c81f
a8e7637
0d2ebd2
e69d87a
ee20f7a
bdc1333
5b5df2a
fa4e364
a9c8b05
9b3bbeb
91f6814
0fef43b
44411f3
04109d4
f08d836
6a08225
189403d
9aeb97d
435523f
4226e16
a362883
439cb01
e1d0b78
66e8595
c0baaa9
d07849d
c82ac10
1593f9e
d541d3a
0f57459
858ef01
0113e02
9047390
2b8be0c
74a6aa9
fe9263b
a0d93c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include <gmock/gmock.h> | ||
| #include <gtest/gtest.h> | ||
|
|
||
| #include "arrow/buffer.h" | ||
|
|
@@ -46,6 +47,8 @@ using internal::CopyBitmap; | |
| using internal::CountSetBits; | ||
| using internal::InvertBitmap; | ||
|
|
||
| using ::testing::ElementsAreArray; | ||
|
|
||
| template <class BitmapWriter> | ||
| void WriteVectorToWriter(BitmapWriter& writer, const std::vector<int> values) { | ||
| for (const auto& value : values) { | ||
|
|
@@ -315,6 +318,184 @@ TEST(FirstTimeBitmapWriter, NormalOperation) { | |
| } | ||
| } | ||
|
|
||
| std::string BitmapToString(const uint8_t* bitmap, int64_t bit_count) { | ||
| return arrow::internal::Bitmap(bitmap, /*offset*/ 0, /*length=*/bit_count).ToString(); | ||
| } | ||
|
|
||
| std::string BitmapToString(const std::vector<uint8_t>& bitmap, int64_t bit_count) { | ||
| return BitmapToString(bitmap.data(), bit_count); | ||
| } | ||
|
|
||
| TEST(FirstTimeBitmapWriter, AppendWordOffsetOverwritesCorrectBitsOnExistingByte) { | ||
| auto check_append = [](const std::string& expected_bits, int64_t offset) { | ||
| std::vector<uint8_t> valid_bits = {0x00}; | ||
|
||
| constexpr int64_t kBitsAfterAppend = 8; | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, | ||
| /*length=*/(8 * valid_bits.size()) - offset); | ||
| writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); | ||
| }; | ||
| check_append("11111111", 0); | ||
| check_append("01111111", 1); | ||
| check_append("00111111", 2); | ||
| check_append("00011111", 3); | ||
| check_append("00001111", 4); | ||
| check_append("00000111", 5); | ||
| check_append("00000011", 6); | ||
| check_append("00000001", 7); | ||
|
|
||
| auto check_with_set = [](const std::string& expected_bits, int64_t offset) { | ||
| std::vector<uint8_t> valid_bits = {0x1}; | ||
| constexpr int64_t kBitsAfterAppend = 8; | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, | ||
| /*length=*/(8 * valid_bits.size()) - offset); | ||
| writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); | ||
| }; | ||
| // 0ffset zero would not be a valid mask. | ||
| check_with_set("11111111", 1); | ||
| check_with_set("10111111", 2); | ||
| check_with_set("10011111", 3); | ||
| check_with_set("10001111", 4); | ||
| check_with_set("10000111", 5); | ||
| check_with_set("10000011", 6); | ||
| check_with_set("10000001", 7); | ||
|
|
||
| auto check_with_preceding = [](const std::string& expected_bits, int64_t offset) { | ||
| std::vector<uint8_t> valid_bits = {0xFF}; | ||
| constexpr int64_t kBitsAfterAppend = 8; | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, | ||
| /*length=*/(8 * valid_bits.size()) - offset); | ||
| writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/kBitsAfterAppend - offset); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, kBitsAfterAppend), expected_bits); | ||
| }; | ||
| check_with_preceding("11111111", 0); | ||
| check_with_preceding("11111111", 1); | ||
| check_with_preceding("11111111", 2); | ||
| check_with_preceding("11111111", 3); | ||
| check_with_preceding("11111111", 4); | ||
| check_with_preceding("11111111", 5); | ||
| check_with_preceding("11111111", 6); | ||
| check_with_preceding("11111111", 7); | ||
| } | ||
|
|
||
| TEST(FirstTimeBitmapWriter, AppendZeroBitsHasNoImpact) { | ||
| std::vector<uint8_t> valid_bits(/*count=*/1, 0); | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, | ||
| /*length=*/valid_bits.size() * 8); | ||
| writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); | ||
| writer.AppendWord(/*word=*/0xFF, /*number_of_bits=*/0); | ||
| writer.AppendWord(/*word=*/0x01, /*number_of_bits=*/1); | ||
| writer.Finish(); | ||
| EXPECT_EQ(valid_bits[0], 0x2); | ||
| } | ||
|
|
||
| TEST(FirstTimeBitmapWriter, AppendLessThanByte) { | ||
| { | ||
| std::vector<uint8_t> valid_bits(/*count*/ 8, 0); | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, | ||
| /*length=*/8); | ||
| writer.AppendWord(0xB, 4); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "01101000"); | ||
| } | ||
| { | ||
| // Test with all bits initially set. | ||
| std::vector<uint8_t> valid_bits(/*count*/ 8, 0xFF); | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, | ||
| /*length=*/8); | ||
| writer.AppendWord(0xB, 4); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/8), "11101000"); | ||
| } | ||
| } | ||
|
|
||
| TEST(FirstTimeBitmapWriter, AppendByteThenMore) { | ||
| { | ||
| std::vector<uint8_t> valid_bits(/*count*/ 8, 0); | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, | ||
| /*length=*/9); | ||
| writer.AppendWord(0xC3, 8); | ||
| writer.AppendWord(0x01, 1); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); | ||
| } | ||
| { | ||
| std::vector<uint8_t> valid_bits(/*count*/ 8, 0xFF); | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/0, | ||
| /*length=*/9); | ||
| writer.AppendWord(0xC3, 8); | ||
| writer.AppendWord(0x01, 1); | ||
| writer.Finish(); | ||
| EXPECT_EQ(BitmapToString(valid_bits, /*bit_count=*/9), "11000011 1"); | ||
| } | ||
| } | ||
|
|
||
| TEST(FirstTimeBitmapWriter, AppendWordShiftsBitsCorrectly) { | ||
| constexpr uint64_t kPattern = 0x9A9A9A9A9A9A9A9A; | ||
| auto check_append = [&](const std::string& leading_bits, const std::string& middle_bits, | ||
| const std::string& trailing_bits, int64_t offset, | ||
| bool preset_buffer_bits = false) { | ||
| ASSERT_GE(offset, 8); | ||
| std::vector<uint8_t> valid_bits(/*count=*/10, preset_buffer_bits ? 0xFF : 0); | ||
| valid_bits[0] = 0x99; | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), offset, | ||
| /*length=*/(9 * sizeof(kPattern)) - offset); | ||
| writer.AppendWord(/*word=*/kPattern, /*number_of_bits=*/64); | ||
| writer.Finish(); | ||
| EXPECT_EQ(valid_bits[0], 0x99); // shouldn't get changed. | ||
| EXPECT_EQ(BitmapToString(valid_bits.data() + 1, /*num_bits=*/8), leading_bits); | ||
| for (int x = 2; x < 9; x++) { | ||
| EXPECT_EQ(BitmapToString(valid_bits.data() + x, /*num_bits=*/8), middle_bits) | ||
| << "x: " << x << " " << offset << " " << BitmapToString(valid_bits.data(), 80); | ||
| } | ||
| EXPECT_EQ(BitmapToString(valid_bits.data() + 9, /*num_bits=*/8), trailing_bits); | ||
| }; | ||
| // Original Pattern = "01011001" | ||
| check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", | ||
| /*trailing_bits=*/"00000000", /*offset=*/8); | ||
| check_append("00101100", "10101100", "10000000", 9); | ||
| check_append("00010110", "01010110", "01000000", 10); | ||
| check_append("00001011", "00101011", "00100000", 11); | ||
| check_append("00000101", "10010101", "10010000", 12); | ||
| check_append("00000010", "11001010", "11001000", 13); | ||
| check_append("00000001", "01100101", "01100100", 14); | ||
| check_append("00000000", "10110010", "10110010", 15); | ||
|
|
||
| check_append(/*leading_bits= */ "01011001", /*middle_bits=*/"01011001", | ||
| /*trailing_bits=*/"11111111", /*offset=*/8, /*preset_buffer_bits=*/true); | ||
| check_append("10101100", "10101100", "10000000", 9, true); | ||
| check_append("11010110", "01010110", "01000000", 10, true); | ||
| check_append("11101011", "00101011", "00100000", 11, true); | ||
| check_append("11110101", "10010101", "10010000", 12, true); | ||
| check_append("11111010", "11001010", "11001000", 13, true); | ||
| check_append("11111101", "01100101", "01100100", 14, true); | ||
| check_append("11111110", "10110010", "10110010", 15, true); | ||
| } | ||
|
|
||
| TEST(TestAppendBitmap, AppendWordOnlyApproriateBytesWritten) { | ||
| std::vector<uint8_t> valid_bits = {0x00, 0x00}; | ||
|
|
||
| uint64_t bitmap = 0x1FF; | ||
| { | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, | ||
| /*length=*/(8 * valid_bits.size()) - 1); | ||
| writer.AppendWord(bitmap, /*number_of_bits*/ 7); | ||
| writer.Finish(); | ||
| EXPECT_THAT(valid_bits, ElementsAreArray(std::vector<uint8_t>{0xFE, 0x00})); | ||
| } | ||
| { | ||
| internal::FirstTimeBitmapWriter writer(valid_bits.data(), /*start_offset=*/1, | ||
| /*length=*/(8 * valid_bits.size()) - 1); | ||
| writer.AppendWord(bitmap, /*number_of_bits*/ 8); | ||
| writer.Finish(); | ||
| EXPECT_THAT(valid_bits, ElementsAreArray(std::vector<uint8_t>{0xFE, 0x03})); | ||
| } | ||
| } | ||
|
|
||
| // Tests for GenerateBits and GenerateBitsUnrolled | ||
|
|
||
| struct GenerateBitsFunctor { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.