From d9858284d3e37349e08f6386f11e0e116e31ef08 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 19 Feb 2025 11:41:53 +0800 Subject: [PATCH] [fix](array index) Correct null bitmap writing for inverted index (#47846) This pull request focuses on improving the handling of null values in the inverted index writer and simplifying the codebase by removing redundant null map checks. The most important changes include removing unnecessary null map handling in several methods and ensuring proper null bitmap updates. Improvements to null value handling and code simplification: * [`be/src/olap/rowset/segment_v2/column_writer.cpp`](diffhunk://#diff-db6023c6e1df0c3616055f02e769cc20fcef7ee083cb3755cec1b661bb7b42ffL952-L958): Removed redundant null map handling in `Status ArrayColumnWriter::append_nullable` method. * [`be/src/olap/rowset/segment_v2/inverted_index_writer.cpp`](diffhunk://#diff-97781916b276f771710ab520c79ca29d5e4e331296fad7573fc9933a376dc165L328-R328): Changed `add_array_nulls` to take the block's null map and row count instead of a single row id, and to add every null row to `_null_bitmap` at its segment-level row id (`_rid - num_rows + i`); the no-op writer's override simply returns `Status::OK()`. * [`be/src/olap/rowset/segment_v2/inverted_index_writer.cpp`](diffhunk://#diff-97781916b276f771710ab520c79ca29d5e4e331296fad7573fc9933a376dc165L429-R426): Added null map check before accessing elements in the loop to prevent potential null pointer dereference. [[1]](diffhunk://#diff-97781916b276f771710ab520c79ca29d5e4e331296fad7573fc9933a376dc165L429-R426) [[2]](diffhunk://#diff-97781916b276f771710ab520c79ca29d5e4e331296fad7573fc9933a376dc165L525-R531) * [`be/src/olap/rowset/segment_v2/inverted_index_writer.cpp`](diffhunk://#diff-97781916b276f771710ab520c79ca29d5e4e331296fad7573fc9933a376dc165R513): Updated `_null_bitmap` in the `add_null_document` method to ensure proper null bitmap updates. * [`be/src/olap/task/index_builder.cpp`](diffhunk://#diff-df38b3b177cd231676ce7a405526b3419c543e29171143ddec02960a84a930c6L645-R645): Removed redundant null map handling in `Status IndexBuilder::_add_nullable` method. 
--- .../olap/rowset/segment_v2/column_writer.cpp | 10 +- .../segment_v2/inverted_index_writer.cpp | 52 +- .../rowset/segment_v2/inverted_index_writer.h | 2 +- be/src/olap/task/index_builder.cpp | 41 +- .../segment_v2/inverted_index_array_test.cpp | 879 +++++++++++++++++- 5 files changed, 905 insertions(+), 79 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 2637017b78d02b..f506cb24fce9a6 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -469,7 +469,9 @@ Status ScalarColumnWriter::init() { return Status::OK(); } Status add_nulls(uint32_t count) override { return Status::OK(); } - Status add_array_nulls(uint32_t row_id) override { return Status::OK(); } + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + return Status::OK(); + } Status finish() override { return Status::OK(); } int64_t size() const override { return 0; } void close_on_error() override {} @@ -951,11 +953,7 @@ Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t RETURN_IF_ERROR(append_data(ptr, num_rows)); if (is_nullable()) { if (_opts.need_inverted_index) { - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map[row_id] == 1) { - RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(row_id)); - } - } + RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(null_map, num_rows)); } RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index d85511722ec092..093d460ae43fd6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -303,8 +303,26 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - Status 
add_array_nulls(uint32_t row_id) override { - _null_bitmap.add(row_id); + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + DCHECK(_rid >= num_rows); + if (num_rows == 0 || null_map == nullptr) { + return Status::OK(); + } + std::vector null_indices; + null_indices.reserve(num_rows / 8); + + // because _rid is the row id in block, not segment, and we add data before we add nulls, + // so we need to subtract num_rows to get the row id in segment + for (size_t i = 0; i < num_rows; i++) { + if (null_map[i] == 1) { + null_indices.push_back(_rid - num_rows + static_cast(i)); + } + } + + if (!null_indices.empty()) { + _null_bitmap.addMany(null_indices.size(), null_indices.data()); + } + return Status::OK(); } @@ -378,8 +396,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - Status add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, - const uint8_t* offsets_ptr, size_t count) override { + Status add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, const uint8_t* offsets_ptr, + size_t count) override { DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_count_is_zero", { count = 0; }) if (count == 0) { @@ -404,7 +423,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { lucene::document::Field* new_field = nullptr; CL_NS(analysis)::TokenStream* ts = nullptr; for (auto j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 1) { continue; } auto* v = (Slice*)((const uint8_t*)value_ptr + j * field_size); @@ -500,7 +519,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { for (int i = 0; i < count; ++i) { auto array_elem_size = offsets[i + 1] - offsets[i]; for (size_t j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 
1) { continue; } const CppType* p = &reinterpret_cast(value_ptr)[j]; @@ -520,7 +539,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_field_is_nullptr", { _field = nullptr; }) DBUG_EXECUTE_IF( - "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_nullptr", + "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_" + "nullptr", { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; @@ -582,9 +602,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::string new_value; size_t value_length = sizeof(CppType); - DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_error", { - _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); - }); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_" + "error", + { _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); }); _value_key_coder->full_encode_ascending(&value, &new_value); _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); @@ -643,8 +664,8 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _bkd_writer->finish(data_out.get(), index_out.get()), int(field_type)); } else { - LOG(WARNING) - << "Inverted index writer create output error occurred: nullptr"; + LOG(WARNING) << "Inverted index writer create output error " + "occurred: nullptr"; _CLTHROWA(CL_ERR_IO, "Create output error with nullptr"); } } else if constexpr (field_is_slice_type(field_type)) { @@ -653,9 +674,12 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { InvertedIndexDescriptor::get_temporary_null_bitmap_file_name())); write_null_bitmap(null_bitmap_out.get()); DBUG_EXECUTE_IF( - "InvertedIndexWriter._throw_clucene_error_in_fulltext_writer_close", { + 
"InvertedIndexWriter._throw_clucene_error_in_fulltext_" + "writer_close", + { _CLTHROWA(CL_ERR_IO, - "debug point: test throw error in fulltext index writer"); + "debug point: test throw error in fulltext " + "index writer"); }); } } catch (CLuceneError& e) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index da90752db09168..a8f719ee1268c6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -64,7 +64,7 @@ class InvertedIndexColumnWriter { size_t count) = 0; virtual Status add_nulls(uint32_t count) = 0; - virtual Status add_array_nulls(uint32_t row_id) = 0; + virtual Status add_array_nulls(const uint8_t* null_map, size_t num_rows) = 0; virtual Status finish() = 0; diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index bc677ea6f5c049..84f2345bb83e69 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -589,9 +589,9 @@ Status IndexBuilder::_write_inverted_index_data(TabletSchemaSPtr tablet_schema, return converted_result.first; } const auto* ptr = (const uint8_t*)converted_result.second->get_data(); - if (converted_result.second->get_nullmap()) { - RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), - converted_result.second->get_nullmap(), &ptr, + const auto* null_map = converted_result.second->get_nullmap(); + if (null_map) { + RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), null_map, &ptr, block->rows())); } else { RETURN_IF_ERROR(_add_data(column_name, writer_sign, field.get(), &ptr, block->rows())); @@ -606,18 +606,6 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, const std::pair& index_writer_sign, Field* field, const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - size_t offset = 0; - auto next_run_step = [&]() { - size_t step = 1; - for (auto i = offset + 1; i < 
num_rows; ++i) { - if (null_map[offset] == null_map[i]) { - step++; - } else { - break; - } - } - return step; - }; // TODO: need to process null data for inverted index if (field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { DCHECK(field->get_sub_field_count() == 1); @@ -638,20 +626,27 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_add_array_values_error", { _CLTHROWA(CL_ERR_IO, "debug point: _add_nullable_add_array_values_error"); }) + RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_nulls(null_map, + num_rows)); } catch (const std::exception& e) { return Status::Error( "CLuceneError occured: {}", e.what()); } - // we should refresh nullmap for array - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map && null_map[row_id] == 1) { - RETURN_IF_ERROR( - _inverted_index_builders[index_writer_sign]->add_array_nulls(row_id)); - } - } + return Status::OK(); } - + size_t offset = 0; + auto next_run_step = [&]() { + size_t step = 1; + for (auto i = offset + 1; i < num_rows; ++i) { + if (null_map[offset] == null_map[i]) { + step++; + } else { + break; + } + } + return step; + }; try { do { auto step = next_run_step(); diff --git a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp index c576097aa5d942..c1a24456955f07 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp @@ -18,10 +18,13 @@ #include #include #include +#include #include #include +#include #include +#include #include #include @@ -30,6 +33,7 @@ #include "io/fs/local_file_system.h" #include "olap/rowset/segment_v2/inverted_index_compound_reader.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "olap/rowset/segment_v2/inverted_index_file_reader.h" #include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include 
"olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -37,6 +41,7 @@ #include "olap/tablet_schema.h" #include "olap/tablet_schema_helper.h" #include "runtime/exec_env.h" +#include "util/faststring.h" #include "util/slice.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" @@ -55,11 +60,29 @@ namespace doris { namespace segment_v2 { class InvertedIndexArrayTest : public testing::Test { + using ExpectedDocMap = std::map>; + public: const std::string kTestDir = "./ut_dir/inverted_index_array_test"; - void check_terms_stats(string file_str) { - std::unique_ptr reader; + void check_terms_stats(std::string index_prefix, ExpectedDocMap* expected, + std::vector expected_null_bitmap = {}, + InvertedIndexStorageFormatPB format = InvertedIndexStorageFormatPB::V1, + const TabletIndex* index_meta = nullptr) { + std::string file_str; + if (format == InvertedIndexStorageFormatPB::V1) { + file_str = InvertedIndexDescriptor::get_index_file_path_v1(index_prefix, + index_meta->index_id(), ""); + } else if (format == InvertedIndexStorageFormatPB::V2) { + file_str = InvertedIndexDescriptor::get_index_file_path_v2(index_prefix); + } + std::unique_ptr reader = std::make_unique( + io::global_local_filesystem(), index_prefix, format); + auto st = reader->init(); + EXPECT_EQ(st, Status::OK()); + auto result = reader->open(index_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); try { CLuceneError err; CL_NS(store)::IndexInput* index_input = nullptr; @@ -68,14 +91,33 @@ class InvertedIndexArrayTest : public testing::Test { if (!ok) { throw err; } - reader = std::make_unique(index_input, 4096); - } catch (...) 
{ - EXPECT_TRUE(false); + + std::shared_ptr null_bitmap = std::make_shared(); + const char* null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name)) { + std::unique_ptr null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name, null_bitmap_in, err, + 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast(buf.data()), false); + EXPECT_TRUE(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); } std::cout << "Term statistics for " << file_str << std::endl; std::cout << "==================================" << std::endl; - lucene::store::Directory* dir = reader.get(); + lucene::store::Directory* dir = compound_reader.get(); IndexReader* r = IndexReader::open(dir); @@ -90,15 +132,31 @@ class InvertedIndexArrayTest : public testing::Test { lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength()); printf("Term: %s ", token.c_str()); + if (expected) { + auto it = expected->find(token); + if (it != expected->end()) { + TermDocs* td = r->termDocs(te->term(false)); + std::vector actual_docs; + while (td->next()) { + actual_docs.push_back(td->doc()); + } + td->close(); + _CLLDELETE(td); + EXPECT_EQ(actual_docs, it->second) << "Term: " << token; + } + } printf("Freq: %d\n", te->docFreq()); } printf("Term count: %d\n\n", nterms); + if (expected) { + ASSERT_EQ(nterms, expected->size()); + } te->close(); _CLLDELETE(te); r->close(); _CLLDELETE(r); - reader->close(); + compound_reader->close(); } void SetUp() override { @@ -121,6 
+179,114 @@ class InvertedIndexArrayTest : public testing::Test { EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); } + // create a TabletSchema with an array column (and a normal int column as key) + TabletSchemaSPtr create_schema_with_array(KeysType keys_type = DUP_KEYS) { + TabletSchemaSPtr tablet_schema = std::make_shared(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(keys_type); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_string"); + child.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + return tablet_schema; + } + + void test_non_null_string(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr _inverted_index_builder = nullptr; + 
EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Construct two arrays: The first row is ["amory","doris"], and the second row is ["amory", "commiter"] + vectorized::Array a1, a2; + a1.push_back("amory"); + a1.push_back("doris"); + a2.push_back("amory"); + a2.push_back("commiter"); + + // Construct array type: DataTypeArray(DataTypeString) + vectorized::DataTypePtr s1 = std::make_shared(); + vectorized::DataTypePtr array_type = std::make_shared(s1); + vectorized::MutableColumnPtr col = array_type->create_column(); + col->insert(a1); + col->insert(a2); + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + // Get the length of the subfield, used for inverted index 
writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + void test_string(std::string_view rowset_id, int seg_id, Field* field) { EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( @@ -147,9 +313,8 @@ class InvertedIndexArrayTest : public testing::Test { EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, index_file_writer.get(), &idx_meta), Status::OK()); - vectorized::PaddedPODArray _slice; - _slice.resize(5); + // Construct two arrays: The first row is ["amory","doris"], and the second row is [NULL, "amory", "commiter"] vectorized::Array a1, a2; a1.push_back("amory"); a1.push_back("doris"); @@ -157,36 +322,644 @@ class InvertedIndexArrayTest : public testing::Test { a2.push_back("amory"); a2.push_back("commiter"); + // Construct array type: DataTypeArray(DataTypeNullable(DataTypeString)) vectorized::DataTypePtr s1 = std::make_shared( std::make_shared()); - vectorized::DataTypePtr au = std::make_shared(s1); - vectorized::MutableColumnPtr col = au->create_column(); + vectorized::DataTypePtr array_type = std::make_shared(s1); + vectorized::MutableColumnPtr col = 
array_type->create_column(); col->insert(a1); col->insert(a2); vectorized::ColumnPtr column_array = std::move(col); - vectorized::ColumnWithTypeAndName type_and_name(column_array, au, "arr1"); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + // Get the length of the subfield, used for inverted index writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + 
EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + + void test_null_write_v2(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + io::FileWriterPtr file_writer; + io::FileWriterOptions opts; + Status sts = fs->create_file(index_path, &file_writer, &opts); + ASSERT_TRUE(sts.ok()); + auto index_file_writer = std::make_unique( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V2, std::move(file_writer)); + std::unique_ptr _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared( + std::make_shared()); + 
vectorized::DataTypePtr array_type = + std::make_shared(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); + + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* 
data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is "mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V2, &idx_meta); + } + + void test_null_write(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + 
index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared( + std::make_shared()); + vectorized::DataTypePtr array_type = + std::make_shared(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); - 
vectorized::PaddedPODArray _offsets; - _offsets.reserve(3); - _offsets.emplace_back(0); - _offsets.emplace_back(2); - _offsets.emplace_back(5); - const uint8_t* offsets_ptr = (const uint8_t*)(_offsets.data()); + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); - auto* col_arr = assert_cast(column_array.get()); + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is 
"mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_multi_block_write(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique( + fs, index_path_prefix, "multi_block", 0, InvertedIndexStorageFormatPB::V1); + std::unique_ptr _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + ExpectedDocMap merged_expected; + + // --- Block 1 --- + { + const int row_num = 4; + // construct data type: Nullable( Array( Nullable(String) ) ) + vectorized::DataTypePtr inner_string = std::make_shared( + std::make_shared()); + vectorized::DataTypePtr array_type = + std::make_shared(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + // construct MutableColumn + vectorized::MutableColumnPtr col = final_type->create_column(); + // simulate outer null: row0 and row3 are null, the rest are 
non-null + col->insert(vectorized::Null()); // row0: null + { + // row1: non-null, array with 1 element: "block1_data1" + vectorized::Array arr; + arr.push_back("block1_data1"); + col->insert(arr); + } + { + // row2: non-null, array with 1 element: "block1_data2" + vectorized::Array arr; + arr.push_back("block1_data2"); + col->insert(arr); + } + col->insert(vectorized::Null()); // row3: null + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // construct Block (containing only the arr1 column) + vectorized::Block block; + block.insert(type_and_name); + + // use TabletSchema containing the array column (arr1 is the non-key column with index 1 in the schema) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // convert the arr1 column in the block + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + // for Block1, the expected non-null behavior is row1 and row2 + ExpectedDocMap expected = {{"block1_data1", {1}}, {"block1_data2", {2}}}; + merged_expected.insert(expected.begin(), 
expected.end()); + } + + // --- Block 2 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared( + std::make_shared()); + vectorized::DataTypePtr array_type = + std::make_shared(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block2_data1" + { + vectorized::Array arr; + arr.push_back("block2_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block2_data1", {4}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + // --- Block 3 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared( + std::make_shared()); + vectorized::DataTypePtr array_type = + std::make_shared(inner_string); + 
vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block3_data1" + { + vectorized::Array arr; + arr.push_back("block3_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block3_data1", {6}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + std::vector expected_null_bitmap = {0, 3, 5, 7}; + check_terms_stats(index_path_prefix, &merged_expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_array_numeric(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + 
std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr_numeric"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + vectorized::DataTypePtr inner_int = std::make_shared(); + vectorized::DataTypePtr array_type = std::make_shared(inner_int); + vectorized::DataTypePtr final_type = + std::make_shared(array_type); + + // create a MutableColumnPtr + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array [123, 456] + { + vectorized::Array arr; + arr.push_back(123); + arr.push_back(456); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + // row2: non-null, array [789, 101112] + { + vectorized::Array arr; + arr.push_back(789); + arr.push_back(101112); + col->insert(arr); + } + // wrap the constructed column into a ColumnWithTypeAndName + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr_num"); + + // construct Block (containing only this column), with 3 rows + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = 
std::make_shared(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_int"); + child.set_type(FieldType::OLAP_FIELD_TYPE_INT); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element total count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast(data_ptr[1]); + const void* item_data = reinterpret_cast(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast(data_ptr[3]); + + // get the size of the sub field (4 bytes for INT type) + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // expected inverted index: row0 contains "123" and "456" (doc id 0), row1 is null, row2 contains "789" and "101112" (doc id 2) + ExpectedDocMap expected = {{"123", {0}}, {"456", {0}}, {"789", {2}}, {"101112", {2}}}; + std::vector expected_null_bitmap = {1}; + + std::unique_ptr 
reader = std::make_unique( + io::global_local_filesystem(), index_path_prefix, InvertedIndexStorageFormatPB::V1); + auto sts = reader->init(); + EXPECT_EQ(sts, Status::OK()); + auto result = reader->open(&idx_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); + try { + CLuceneError err; + CL_NS(store)::IndexInput* index_input = nullptr; + auto ok = DorisFSDirectory::FSIndexInput::open( + io::global_local_filesystem(), index_path.c_str(), index_input, err, 4096); + if (!ok) { + throw err; + } + + std::shared_ptr null_bitmap = std::make_shared(); + const char* null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name)) { + std::unique_ptr null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name, null_bitmap_in, err, + 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast(buf.data()), false); + assert(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); + } + } + +private: + static void build_slices(vectorized::PaddedPODArray& slices, + const vectorized::ColumnPtr& column_array, size_t num_strings) { + const auto* col_arr = assert_cast(column_array.get()); const vectorized::UInt8* nested_null_map = assert_cast(col_arr->get_data_ptr().get()) - ->get_null_map_data() + ->get_null_map_column() + .get_data() .data(); - auto* col_arr_str = assert_cast( + const auto* col_arr_str = assert_cast( assert_cast(col_arr->get_data_ptr().get()) 
->get_nested_column_ptr() .get()); const char* char_data = (const char*)(col_arr_str->get_chars().data()); const vectorized::ColumnString::Offset* offset_cur = col_arr_str->get_offsets().data(); - const vectorized::ColumnString::Offset* offset_end = offset_cur + 5; - - Slice* slice = _slice.data(); + const vectorized::ColumnString::Offset* offset_end = offset_cur + num_strings; + Slice* slice = slices.data(); size_t string_offset = *(offset_cur - 1); const vectorized::UInt8* nullmap_cur = nested_null_map; while (offset_cur != offset_end) { @@ -202,16 +975,6 @@ class InvertedIndexArrayTest : public testing::Test { ++slice; ++offset_cur; } - - auto field_size = field->get_sub_field(0)->size(); - Status st = _inverted_index_builder->add_array_values( - field_size, reinterpret_cast(_slice.data()), - reinterpret_cast(nested_null_map), offsets_ptr, 2); - EXPECT_EQ(st, Status::OK()); - EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); - EXPECT_EQ(index_file_writer->close(), Status::OK()); - - check_terms_stats(index_path); } }; @@ -227,8 +990,54 @@ TEST_F(InvertedIndexArrayTest, ArrayString) { arrayTabletColumn.add_sub_column(arraySubColumn); Field* field = FieldFactory::create(arrayTabletColumn); test_string("rowset_id", 0, field); + test_non_null_string("rowset_id_non_null", 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, ComplexNullCases) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_null_write("complex_null", 0, field); + test_null_write_v2("complex_null_v2", 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, 
MultiBlockWrite) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_multi_block_write("multi_block", 0, field); delete field; } +TEST_F(InvertedIndexArrayTest, ArrayInt) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_int"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_INT); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_array_numeric("int_test", 0, field); + delete field; +} } // namespace segment_v2 -} // namespace doris +} // namespace doris \ No newline at end of file