From 0067fcc8bd672ec110c48232b7b6bbce39e9a7b1 Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Tue, 7 Oct 2025 13:46:03 -0700 Subject: [PATCH] fix: Handle null constant result in duplicated array and map column readers Summary: In case of some irregular encodings of deduplicated arrays or maps, specifically: 1. The deduplicated field is stored as flat map values, with at least 2 keys; 2. Array elements in one of the map values are all nulls, and thus stored as null streams; 3. The reader reads the key with null elements runs first, and the read starts a new run; 4. Then read the other key, and it is a continuation of previous run, plus some new elements at end. When the above conditions are all satisfied, the elements vector is changed to a constant vector by `NullColumnReader` first, then we try to write to it because the second key is reusing the value from previous run, so it tries to copy the cached value into result vector, and throw an exception. Fix this by resetting the result if it is not writable (constant or dictionary). Differential Revision: D84085512 --- .../selective/VariableLengthColumnReader.cpp | 97 ++++++++++--------- .../tests/SelectiveNimbleReaderTest.cpp | 38 +++++++- 2 files changed, 83 insertions(+), 52 deletions(-) diff --git a/dwio/nimble/velox/selective/VariableLengthColumnReader.cpp b/dwio/nimble/velox/selective/VariableLengthColumnReader.cpp index 052fbba9..26971b67 100644 --- a/dwio/nimble/velox/selective/VariableLengthColumnReader.cpp +++ b/dwio/nimble/velox/selective/VariableLengthColumnReader.cpp @@ -53,6 +53,51 @@ bool estimateMaterializedSizeImpl( return true; } +void tryReuseArrayVectorBase( + velox::ArrayVectorBase& vector, + velox::vector_size_t size, + velox::BufferPtr& offsets, + velox::BufferPtr& sizes) { + if (vector.offsets() && vector.offsets()->isMutable()) { + offsets = vector.mutableOffsets(size); + } + if (vector.sizes() && vector.sizes()->isMutable()) { + sizes = vector.mutableSizes(size); + } +} + +velox::VectorPtr tryReuseWritableVector(const velox::VectorPtr& vector) { + if (vector && vector->encoding() != velox::VectorEncoding::Simple::CONSTANT && + vector->encoding() != velox::VectorEncoding::Simple::DICTIONARY && + vector.use_count() == 1) { + vector->resize(0); + return vector; + } + return nullptr; +} + +void tryReuseArrayVector( + velox::ArrayVector& vector, + velox::vector_size_t size, + velox::BufferPtr& offsets, + velox::BufferPtr& sizes, + velox::VectorPtr& elements) { + tryReuseArrayVectorBase(vector, size, offsets, sizes); + elements = tryReuseWritableVector(vector.elements()); +} + +void tryReuseMapVector( + velox::MapVector& vector, + velox::vector_size_t size, + velox::BufferPtr& offsets, + velox::BufferPtr& sizes, + velox::VectorPtr& keys, + velox::VectorPtr& values) { + tryReuseArrayVectorBase(vector, size, offsets, sizes); + keys = tryReuseWritableVector(vector.mapKeys()); + values = tryReuseWritableVector(vector.mapValues()); +} + velox::DictionaryVector* prepareDictionaryArrayResult( velox::VectorPtr& result, const velox::TypePtr& type, @@ -78,29 +123,11 @@ velox::DictionaryVector* prepareDictionaryArrayResult( alphabet = dictionaryVector->valueVector(); auto arrayVector = alphabet->as(); if (arrayVector) { - if (arrayVector->sizes() && arrayVector->sizes()->isMutable()) { - sizes = arrayVector->mutableSizes(size); - } - if (arrayVector->offsets() && arrayVector->offsets()->isMutable()) { - offsets = arrayVector->mutableOffsets(size); - } - if (arrayVector->elements() && arrayVector->elements().unique()) { - elements = arrayVector->elements(); - elements->resize(0); - } + tryReuseArrayVector(*arrayVector, size, offsets, sizes, elements); } } } else if (auto arrayVector = result->as()) { - if (arrayVector->sizes() && arrayVector->sizes()->isMutable()) { - sizes = arrayVector->mutableSizes(size); - } - if (arrayVector->offsets() && arrayVector->offsets()->isMutable()) { - offsets = arrayVector->mutableOffsets(size); - } - if (arrayVector->elements() && arrayVector->elements().unique()) { - elements = arrayVector->elements(); - elements->resize(0); - } + tryReuseArrayVector(*arrayVector, size, offsets, sizes, elements); } } @@ -171,37 +198,11 @@ velox::DictionaryVector* prepareDictionaryMapResult( alphabet = dictionaryVector->valueVector(); auto mapVector = alphabet->as(); if (mapVector) { - if (mapVector->sizes() && mapVector->sizes()->isMutable()) { - sizes = mapVector->mutableSizes(size); - } - if (mapVector->offsets() && mapVector->offsets()->isMutable()) { - offsets = mapVector->mutableOffsets(size); - } - if (mapVector->mapKeys() && mapVector->mapKeys().unique()) { - keys = mapVector->mapKeys(); - keys->resize(0); - } - if (mapVector->mapValues() && mapVector->mapValues().unique()) { - elements = mapVector->mapValues(); - elements->resize(0); - } + tryReuseMapVector(*mapVector, size, offsets, sizes, keys, elements); } } } else if (auto mapVector = result->as()) { - if (mapVector->sizes() && mapVector->sizes()->isMutable()) { - sizes = mapVector->mutableSizes(size); - } - if (mapVector->offsets() && mapVector->offsets()->isMutable()) { - offsets = mapVector->mutableOffsets(size); - } - if (mapVector->mapKeys() && mapVector->mapKeys().unique()) { - keys = mapVector->mapKeys(); - keys->resize(0); - } - if (mapVector->mapValues() && mapVector->mapValues().unique()) { - elements = mapVector->mapValues(); - elements->resize(0); - } + tryReuseMapVector(*mapVector, size, offsets, sizes, keys, elements); } } diff --git a/dwio/nimble/velox/selective/tests/SelectiveNimbleReaderTest.cpp b/dwio/nimble/velox/selective/tests/SelectiveNimbleReaderTest.cpp index 08371f9e..1409e86a 100644 --- a/dwio/nimble/velox/selective/tests/SelectiveNimbleReaderTest.cpp +++ b/dwio/nimble/velox/selective/tests/SelectiveNimbleReaderTest.cpp @@ -22,16 +22,15 @@ #include namespace facebook::nimble { +namespace { + +using namespace facebook::velox; enum FilterType { kNone, kKeep, kDrop }; auto format_as(FilterType filterType) { return fmt::underlying(filterType); } -namespace { - -using namespace facebook::velox; - // This test suite covers the basic and mostly single batch test cases, as well // as some corner cases that are hard to cover in randomized tests. We rely on // E2EFilterTest for more comprehensive tests with multi stripes and multi @@ -1006,6 +1005,37 @@ TEST_F(SelectiveNimbleReaderTest, arrayWithOffsetsLastRunFilteredOutAfterRead) { {{{1}}, {{1}}}, {false, true}, {1, 1}, std::nullopt, true); } +TEST_F(SelectiveNimbleReaderTest, arrayWithOffsetsReuseNullResult) { + auto vector = makeRowVector({ + std::make_shared( + pool(), + MAP(BIGINT(), ARRAY(BIGINT())), + nullptr, + 4, + makeIndices({0, 2, 4, 6}), + makeIndices({2, 2, 2, 2}), + makeFlatVector({1, 2, 1, 2, 1, 2, 1, 2}), + makeNullableArrayVector({ + {std::nullopt, std::nullopt}, + {std::optional(3)}, + {std::nullopt, std::nullopt}, + {std::optional(3)}, + {std::nullopt}, + {std::optional(3)}, + {std::nullopt}, + {std::optional(4)}, + })), + }); + VeloxWriterOptions writerOptions; + writerOptions.flatMapColumns = {"c0"}; + writerOptions.dictionaryArrayColumns = {"c0"}; + auto fileContent = test::createNimbleFile(*rootPool(), vector, writerOptions); + auto scanSpec = std::make_shared("root"); + scanSpec->addAllChildFields(*vector->type()); + auto readers = makeReaders(vector, fileContent, scanSpec); + validate(*vector, *readers.rowReader, 2, [](auto) { return true; }); +} + TEST_F(SelectiveNimbleReaderTest, slidingWindowMapSubfieldPruning) { common::BigintRange keyFilter(2, 2, false); checkSlidingWindowMap(