From 89c4bc91564e28613b5d66aa40514d09953672a0 Mon Sep 17 00:00:00 2001 From: Yukang-Lian Date: Thu, 19 Oct 2023 11:02:51 +0800 Subject: [PATCH 1/5] 1 --- be/test/olap/primary_key_index_test.cpp | 233 +++++++++++++++++++++++- 1 file changed, 232 insertions(+), 1 deletion(-) diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp index 4de6be24feb9e6..f91eac3e9f4f34 100644 --- a/be/test/olap/primary_key_index_test.cpp +++ b/be/test/olap/primary_key_index_test.cpp @@ -44,7 +44,7 @@ class PrimaryKeyIndexTest : public testing::Test { EXPECT_TRUE(io::global_local_filesystem()->delete_and_create_directory(kTestDir).ok()); } void TearDown() override { - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); + //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); } private: @@ -167,4 +167,235 @@ TEST_F(PrimaryKeyIndexTest, builder) { } } +TEST_F(PrimaryKeyIndexTest, single_page) { + std::string filename = kTestDir + "/builder"; + io::FileWriterPtr file_writer; + auto fs = io::global_local_filesystem(); + EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); + + PrimaryKeyIndexBuilder builder(file_writer.get(), 0); + static_cast(builder.init()); + size_t num_rows = 0; + std::vector keys; + for (int i = 1000; i < 10000; i += 2) { + keys.push_back(std::to_string(i)); + static_cast(builder.add_item(std::to_string(i))); + num_rows++; + } + EXPECT_EQ("1000", builder.min_key().to_string()); + EXPECT_EQ("9998", builder.max_key().to_string()); + segment_v2::PrimaryKeyIndexMetaPB index_meta; + EXPECT_TRUE(builder.finalize(&index_meta)); + EXPECT_EQ(builder.disk_size(), file_writer->bytes_appended()); + EXPECT_TRUE(file_writer->close().ok()); + EXPECT_EQ(num_rows, builder.num_rows()); + + PrimaryKeyIndexReader index_reader; + io::FileReaderSPtr file_reader; + EXPECT_TRUE(fs->open_file(filename, &file_reader).ok()); + EXPECT_TRUE(index_reader.parse_index(file_reader, index_meta).ok()); + EXPECT_TRUE(index_reader.parse_bf(file_reader, index_meta).ok()); + EXPECT_EQ(num_rows, index_reader.num_rows()); + + std::unique_ptr index_iterator; + EXPECT_TRUE(index_reader.new_iterator(&index_iterator).ok()); + bool exact_match = false; + uint32_t row_id; + for (size_t i = 0; i < keys.size(); i++) { + bool exists = index_reader.check_present(keys[i]); + EXPECT_TRUE(exists); + auto status = index_iterator->seek_at_or_after(&keys[i], &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(i, row_id); + } + // find a non-existing key "8701" + { + string key("8701"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(3851, row_id); + } + + // find prefix "87" + { + string key("87"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(3850, row_id); + } + + // find prefix "9999" + { + string key("9999"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_FALSE(exact_match); + EXPECT_TRUE(status.is()); + } + + // read all key + { + int32_t remaining = num_rows; + std::string last_key; + int num_batch = 0; + int batch_size = 1024; + while (remaining > 0) { + std::unique_ptr iter; + EXPECT_TRUE(index_reader.new_iterator(&iter).ok()); + + size_t num_to_read = std::min(batch_size, remaining); + auto index_type = vectorized::DataTypeFactory::instance().create_data_type( + index_reader.type_info()->type(), 1, 0); + auto index_column = index_type->create_column(); + Slice last_key_slice(last_key); + EXPECT_TRUE(iter->seek_at_or_after(&last_key_slice, &exact_match).ok()); + + size_t num_read = num_to_read; + EXPECT_TRUE(iter->next_batch(&num_read, index_column).ok()); + EXPECT_EQ(num_to_read, num_read); + last_key = index_column->get_data_at(num_read - 1).to_string(); + // exclude last_key, last_key will be read in next batch. + if (num_read == batch_size && num_read != remaining) { + num_read -= 1; + } + for (size_t i = 0; i < num_read; i++) { + Slice key = + Slice(index_column->get_data_at(i).data, index_column->get_data_at(i).size); + DCHECK_EQ(keys[i + (batch_size - 1) * num_batch], key.to_string()); + } + num_batch++; + remaining -= num_read; + } + } +} + +TEST_F(PrimaryKeyIndexTest, multiple_pages) { + std::string filename = kTestDir + "/builder"; + io::FileWriterPtr file_writer; + auto fs = io::global_local_filesystem(); + EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); + + PrimaryKeyIndexBuilder builder(file_writer.get(), 0); + static_cast(builder.init()); + size_t num_rows = 0; + std::vector keys; + for (int i = 1000; i < 10000; i += 2) { + keys.push_back(std::to_string(i)); + static_cast(builder.add_item(std::to_string(i))); + num_rows++; + } + EXPECT_EQ("1000", builder.min_key().to_string()); + EXPECT_EQ("9998", builder.max_key().to_string()); + segment_v2::PrimaryKeyIndexMetaPB index_meta; + EXPECT_TRUE(builder.finalize(&index_meta)); + EXPECT_EQ(builder.disk_size(), file_writer->bytes_appended()); + EXPECT_TRUE(file_writer->close().ok()); + EXPECT_EQ(num_rows, builder.num_rows()); + + PrimaryKeyIndexReader index_reader; + io::FileReaderSPtr file_reader; + EXPECT_TRUE(fs->open_file(filename, &file_reader).ok()); + EXPECT_TRUE(index_reader.parse_index(file_reader, index_meta).ok()); + EXPECT_TRUE(index_reader.parse_bf(file_reader, index_meta).ok()); + EXPECT_EQ(num_rows, index_reader.num_rows()); + + std::unique_ptr index_iterator; + EXPECT_TRUE(index_reader.new_iterator(&index_iterator).ok()); + bool exact_match = false; + uint32_t row_id; + for (size_t i = 0; i < keys.size(); i++) { + bool exists = index_reader.check_present(keys[i]); + EXPECT_TRUE(exists); + auto status = index_iterator->seek_at_or_after(&keys[i], &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(i, row_id); + } + // find a non-existing key "8701" + { + string key("8701"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(3851, row_id); + } + + // find prefix "87" + { + string key("87"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(exact_match); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(3850, row_id); + } + + // find prefix "9999" + { + string key("9999"); + Slice slice(key); + bool exists = index_reader.check_present(slice); + EXPECT_FALSE(exists); + auto status = index_iterator->seek_at_or_after(&slice, &exact_match); + EXPECT_FALSE(exact_match); + EXPECT_TRUE(status.is()); + } + + // read all key + { + int32_t remaining = num_rows; + std::string last_key; + int num_batch = 0; + int batch_size = 1024; + while (remaining > 0) { + std::unique_ptr iter; + EXPECT_TRUE(index_reader.new_iterator(&iter).ok()); + + size_t num_to_read = std::min(batch_size, remaining); + auto index_type = vectorized::DataTypeFactory::instance().create_data_type( + index_reader.type_info()->type(), 1, 0); + auto index_column = index_type->create_column(); + Slice last_key_slice(last_key); + EXPECT_TRUE(iter->seek_at_or_after(&last_key_slice, &exact_match).ok()); + + size_t num_read = num_to_read; + EXPECT_TRUE(iter->next_batch(&num_read, index_column).ok()); + EXPECT_EQ(num_to_read, num_read); + last_key = index_column->get_data_at(num_read - 1).to_string(); + // exclude last_key, last_key will be read in next batch. + if (num_read == batch_size && num_read != remaining) { + num_read -= 1; + } + for (size_t i = 0; i < num_read; i++) { + Slice key = + Slice(index_column->get_data_at(i).data, index_column->get_data_at(i).size); + DCHECK_EQ(keys[i + (batch_size - 1) * num_batch], key.to_string()); + } + num_batch++; + remaining -= num_read; + } + } +} } // namespace doris From a41b6942352edf877a09df0b06dd9a6ea1caa399 Mon Sep 17 00:00:00 2001 From: Yukang-Lian Date: Thu, 19 Oct 2023 23:14:26 +0800 Subject: [PATCH 2/5] 2 --- be/src/olap/primary_key_index.h | 3 + .../rowset/segment_v2/indexed_column_writer.h | 3 + be/test/olap/primary_key_index_test.cpp | 159 ++++-------------- 3 files changed, 39 insertions(+), 126 deletions(-) diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h index 233644b4e07173..59b88c2f72427a 100644 --- a/be/src/olap/primary_key_index.h +++ b/be/src/olap/primary_key_index.h @@ -67,6 +67,9 @@ class PrimaryKeyIndexBuilder { uint64_t disk_size() const { return _disk_size; } + // used for be ut + uint32_t data_page_num() const { return _primary_key_index_builder->data_page_num(); } + Slice min_key() { return Slice(_min_key.data(), _min_key.size() - _seq_col_length); } Slice max_key() { return Slice(_max_key.data(), _max_key.size() - _seq_col_length); } diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.h b/be/src/olap/rowset/segment_v2/indexed_column_writer.h index ba61708dd90936..ecb26782ad1c7f 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_writer.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.h @@ -22,6 +22,7 @@ #include #include +#include #include #include "common/status.h" @@ -85,6 +86,8 @@ class IndexedColumnWriter { uint64_t disk_size() const { return _disk_size; } + uint32_t data_page_num() const { return _num_data_pages + 1; } + private: Status _finish_current_data_page(size_t& num_val); diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp index f91eac3e9f4f34..28f543ccc58779 100644 --- a/be/test/olap/primary_key_index_test.cpp +++ b/be/test/olap/primary_key_index_test.cpp @@ -167,23 +167,26 @@ TEST_F(PrimaryKeyIndexTest, builder) { } } -TEST_F(PrimaryKeyIndexTest, single_page) { - std::string filename = kTestDir + "/builder"; +TEST_F(PrimaryKeyIndexTest, multiple_pages) { + std::string filename = kTestDir + "/multiple_pages"; io::FileWriterPtr file_writer; auto fs = io::global_local_filesystem(); EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); + config::primary_key_data_page_size = 5 * 5; PrimaryKeyIndexBuilder builder(file_writer.get(), 0); static_cast(builder.init()); size_t num_rows = 0; - std::vector keys; - for (int i = 1000; i < 10000; i += 2) { - keys.push_back(std::to_string(i)); - static_cast(builder.add_item(std::to_string(i))); + std::vector keys {"00000", "00002", "00004", "00006", "00008", + "00010", "00012", "00014", "00016", "00018"}; + for (const std::string& key : keys) { + static_cast(builder.add_item(key)); num_rows++; } - EXPECT_EQ("1000", builder.min_key().to_string()); - EXPECT_EQ("9998", builder.max_key().to_string()); + EXPECT_EQ("00000", builder.min_key().to_string()); + EXPECT_EQ("00018", builder.max_key().to_string()); + EXPECT_EQ(builder.size(), 2 * 5 * 5); + EXPECT_GT(builder.data_page_num(), 1); segment_v2::PrimaryKeyIndexMetaPB index_meta; EXPECT_TRUE(builder.finalize(&index_meta)); EXPECT_EQ(builder.disk_size(), file_writer->bytes_appended()); @@ -210,35 +213,21 @@ TEST_F(PrimaryKeyIndexTest, single_page) { row_id = index_iterator->get_current_ordinal(); EXPECT_EQ(i, row_id); } - // find a non-existing key "8701" - { - string key("8701"); - Slice slice(key); - bool exists = index_reader.check_present(slice); - EXPECT_FALSE(exists); - auto status = index_iterator->seek_at_or_after(&slice, &exact_match); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(exact_match); - row_id = index_iterator->get_current_ordinal(); - EXPECT_EQ(3851, row_id); - } - // find prefix "87" - { - string key("87"); - Slice slice(key); + std::vector non_exist_keys {"00001", "00003", "00005", "00007", "00009", + "00011", "00013", "00015", "00017"}; + for (size_t i = 0; i < non_exist_keys.size(); i++) { + Slice slice(non_exist_keys[i]); bool exists = index_reader.check_present(slice); EXPECT_FALSE(exists); auto status = index_iterator->seek_at_or_after(&slice, &exact_match); EXPECT_TRUE(status.ok()); EXPECT_FALSE(exact_match); row_id = index_iterator->get_current_ordinal(); - EXPECT_EQ(3850, row_id); + EXPECT_EQ(i + 1, row_id); } - - // find prefix "9999" { - string key("9999"); + string key("00019"); Slice slice(key); bool exists = index_reader.check_present(slice); EXPECT_FALSE(exists); @@ -246,45 +235,10 @@ TEST_F(PrimaryKeyIndexTest, single_page) { EXPECT_FALSE(exact_match); EXPECT_TRUE(status.is()); } - - // read all key - { - int32_t remaining = num_rows; - std::string last_key; - int num_batch = 0; - int batch_size = 1024; - while (remaining > 0) { - std::unique_ptr iter; - EXPECT_TRUE(index_reader.new_iterator(&iter).ok()); - - size_t num_to_read = std::min(batch_size, remaining); - auto index_type = vectorized::DataTypeFactory::instance().create_data_type( - index_reader.type_info()->type(), 1, 0); - auto index_column = index_type->create_column(); - Slice last_key_slice(last_key); - EXPECT_TRUE(iter->seek_at_or_after(&last_key_slice, &exact_match).ok()); - - size_t num_read = num_to_read; - EXPECT_TRUE(iter->next_batch(&num_read, index_column).ok()); - EXPECT_EQ(num_to_read, num_read); - last_key = index_column->get_data_at(num_read - 1).to_string(); - // exclude last_key, last_key will be read in next batch. - if (num_read == batch_size && num_read != remaining) { - num_read -= 1; - } - for (size_t i = 0; i < num_read; i++) { - Slice key = - Slice(index_column->get_data_at(i).data, index_column->get_data_at(i).size); - DCHECK_EQ(keys[i + (batch_size - 1) * num_batch], key.to_string()); - } - num_batch++; - remaining -= num_read; - } - } } -TEST_F(PrimaryKeyIndexTest, multiple_pages) { - std::string filename = kTestDir + "/builder"; +TEST_F(PrimaryKeyIndexTest, single_page) { + std::string filename = kTestDir + "/single_page"; io::FileWriterPtr file_writer; auto fs = io::global_local_filesystem(); EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); @@ -292,14 +246,16 @@ TEST_F(PrimaryKeyIndexTest, multiple_pages) { PrimaryKeyIndexBuilder builder(file_writer.get(), 0); static_cast(builder.init()); size_t num_rows = 0; - std::vector keys; - for (int i = 1000; i < 10000; i += 2) { - keys.push_back(std::to_string(i)); - static_cast(builder.add_item(std::to_string(i))); + std::vector keys {"00000", "00002", "00004", "00006", "00008", + "00010", "00012", "00014", "00016", "00018"}; + for (const std::string& key : keys) { + static_cast(builder.add_item(key)); num_rows++; } - EXPECT_EQ("1000", builder.min_key().to_string()); - EXPECT_EQ("9998", builder.max_key().to_string()); + EXPECT_EQ("00000", builder.min_key().to_string()); + EXPECT_EQ("00018", builder.max_key().to_string()); + EXPECT_EQ(builder.size(), 2 * 5 * 5); + EXPECT_EQ(builder.data_page_num(), 1); segment_v2::PrimaryKeyIndexMetaPB index_meta; EXPECT_TRUE(builder.finalize(&index_meta)); EXPECT_EQ(builder.disk_size(), file_writer->bytes_appended()); @@ -326,35 +282,21 @@ TEST_F(PrimaryKeyIndexTest, multiple_pages) { row_id = index_iterator->get_current_ordinal(); EXPECT_EQ(i, row_id); } - // find a non-existing key "8701" - { - string key("8701"); - Slice slice(key); - bool exists = index_reader.check_present(slice); - EXPECT_FALSE(exists); - auto status = index_iterator->seek_at_or_after(&slice, &exact_match); - EXPECT_TRUE(status.ok()); - EXPECT_FALSE(exact_match); - row_id = index_iterator->get_current_ordinal(); - EXPECT_EQ(3851, row_id); - } - // find prefix "87" - { - string key("87"); - Slice slice(key); + std::vector non_exist_keys {"00001", "00003", "00005", "00007", "00009", + "00011", "00013", "00015", "00017"}; + for (size_t i = 0; i < non_exist_keys.size(); i++) { + Slice slice(non_exist_keys[i]); bool exists = index_reader.check_present(slice); EXPECT_FALSE(exists); auto status = index_iterator->seek_at_or_after(&slice, &exact_match); EXPECT_TRUE(status.ok()); EXPECT_FALSE(exact_match); row_id = index_iterator->get_current_ordinal(); - EXPECT_EQ(3850, row_id); + EXPECT_EQ(i + 1, row_id); } - - // find prefix "9999" { - string key("9999"); + string key("00019"); Slice slice(key); bool exists = index_reader.check_present(slice); EXPECT_FALSE(exists); @@ -362,40 +304,5 @@ TEST_F(PrimaryKeyIndexTest, multiple_pages) { EXPECT_FALSE(exact_match); EXPECT_TRUE(status.is()); } - - // read all key - { - int32_t remaining = num_rows; - std::string last_key; - int num_batch = 0; - int batch_size = 1024; - while (remaining > 0) { - std::unique_ptr iter; - EXPECT_TRUE(index_reader.new_iterator(&iter).ok()); - - size_t num_to_read = std::min(batch_size, remaining); - auto index_type = vectorized::DataTypeFactory::instance().create_data_type( - index_reader.type_info()->type(), 1, 0); - auto index_column = index_type->create_column(); - Slice last_key_slice(last_key); - EXPECT_TRUE(iter->seek_at_or_after(&last_key_slice, &exact_match).ok()); - - size_t num_read = num_to_read; - EXPECT_TRUE(iter->next_batch(&num_read, index_column).ok()); - EXPECT_EQ(num_to_read, num_read); - last_key = index_column->get_data_at(num_read - 1).to_string(); - // exclude last_key, last_key will be read in next batch. - if (num_read == batch_size && num_read != remaining) { - num_read -= 1; - } - for (size_t i = 0; i < num_read; i++) { - Slice key = - Slice(index_column->get_data_at(i).data, index_column->get_data_at(i).size); - DCHECK_EQ(keys[i + (batch_size - 1) * num_batch], key.to_string()); - } - num_batch++; - remaining -= num_read; - } - } } } // namespace doris From db429b107f2a65fd4e9c5dcc1284befb0080ca1b Mon Sep 17 00:00:00 2001 From: Yukang-Lian Date: Thu, 19 Oct 2023 23:16:42 +0800 Subject: [PATCH 3/5] 3 --- be/test/olap/primary_key_index_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp index 28f543ccc58779..16131fac68fc3b 100644 --- a/be/test/olap/primary_key_index_test.cpp +++ b/be/test/olap/primary_key_index_test.cpp @@ -44,7 +44,7 @@ class PrimaryKeyIndexTest : public testing::Test { EXPECT_TRUE(io::global_local_filesystem()->delete_and_create_directory(kTestDir).ok()); } void TearDown() override { - //EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); } private: From b58ef41da3f063c4445c49298d5848866516354a Mon Sep 17 00:00:00 2001 From: Yukang-Lian Date: Thu, 19 Oct 2023 23:53:57 +0800 Subject: [PATCH 4/5] 4 --- be/test/olap/primary_key_index_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp index 16131fac68fc3b..fb6d7caec39497 100644 --- a/be/test/olap/primary_key_index_test.cpp +++ b/be/test/olap/primary_key_index_test.cpp @@ -242,6 +242,7 @@ TEST_F(PrimaryKeyIndexTest, single_page) { io::FileWriterPtr file_writer; auto fs = io::global_local_filesystem(); EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); + config::primary_key_data_page_size = 32768; PrimaryKeyIndexBuilder builder(file_writer.get(), 0); static_cast(builder.init()); From 8da8807772ee4df1f1973d956522b12b3cd18afb Mon Sep 17 00:00:00 2001 From: Yukang-Lian Date: Fri, 20 Oct 2023 00:08:15 +0800 Subject: [PATCH 5/5] 5 --- be/test/olap/primary_key_index_test.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp index fb6d7caec39497..fb96e7411e678f 100644 --- a/be/test/olap/primary_key_index_test.cpp +++ b/be/test/olap/primary_key_index_test.cpp @@ -213,6 +213,20 @@ TEST_F(PrimaryKeyIndexTest, multiple_pages) { row_id = index_iterator->get_current_ordinal(); EXPECT_EQ(i, row_id); } + for (size_t i = 0; i < keys.size(); i++) { + bool exists = index_reader.check_present(keys[i]); + EXPECT_TRUE(exists); + auto status = index_iterator->seek_to_ordinal(i); + EXPECT_TRUE(status.ok()); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(i, row_id); + } + { + auto status = index_iterator->seek_to_ordinal(10); + EXPECT_TRUE(status.ok()); + row_id = index_iterator->get_current_ordinal(); + EXPECT_EQ(10, row_id); + } std::vector non_exist_keys {"00001", "00003", "00005", "00007", "00009", "00011", "00013", "00015", "00017"};