From 50e6d2a9d08a9df6ad1b832f9cb9cdd0629a3a36 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Thu, 26 Dec 2024 19:42:25 +0800 Subject: [PATCH 1/3] [opt](bloomfilter index) optimize memory usage for bloom filter index writer (#45833) Issue Number: close #xxx Related PR: #xxx Problem Summary: Optimize memory usage when adding string values for bloom filter index. Using uint64 hash value instead of string values itself, it is expected to save a lot of memory for especially long text --- be/src/olap/rowset/segment_v2/bloom_filter.h | 10 + .../segment_v2/bloom_filter_index_writer.cpp | 27 +- .../bloom_filter_index_reader_writer_test.cpp | 460 +++++++++++++++++- 3 files changed, 485 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 13b1558431e974..15644b8d6bfccd 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -153,6 +153,16 @@ class BloomFilter { return hash_code; } + static Result hash(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (strategy == HASH_MURMUR3_X64_64) { + uint64_t hash_code; + murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &hash_code); + return hash_code; + } else { + return Status::InvalidArgument("invalid strategy:{}", strategy); + } + } + virtual void add_bytes(const char* buf, uint32_t size) { if (buf == nullptr) { *_has_null = true; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 7497436546697c..017393d8ffa35a 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -84,9 +84,10 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { for (int i = 0; i < count; ++i) { if (_values.find(*v) == _values.end()) { if constexpr (_is_slice_type()) { - CppType new_value; - RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena)); - _values.insert(new_value); + const auto* s = reinterpret_cast(v); + auto hash = + DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy)); + _hash_values.insert(hash); } else if constexpr (_is_int128()) { int128_t new_value; memcpy(&new_value, v, sizeof(PackedInt128)); @@ -105,25 +106,28 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { Status flush() override { std::unique_ptr bf; RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf)); - RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); - bf->set_has_null(_has_null); - for (auto& v : _values) { - if constexpr (_is_slice_type()) { - Slice* s = (Slice*)&v; - bf->add_bytes(s->data, s->size); - } else { + if constexpr (_is_slice_type()) { + RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (const auto& h : _hash_values) { + bf->add_hash(h); + } + } else { + RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (auto& v : _values) { bf->add_bytes((char*)&v, sizeof(CppType)); } } + bf->set_has_null(_has_null); _bf_buffer_size += bf->size(); _bfs.push_back(std::move(bf)); _values.clear(); + _hash_values.clear(); _has_null = false; return Status::OK(); } Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override { - if (_values.size() > 0) { + if (_values.size() > 0 || !_hash_values.empty()) { RETURN_IF_ERROR(flush()); } index_meta->set_type(BLOOM_FILTER_INDEX); @@ -172,6 +176,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { // distinct values ValueDict _values; std::vector> _bfs; + std::set _hash_values; }; } // namespace diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 258dd9a5ff8b51..a395ee0cd2f266 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -160,7 +160,12 @@ void test_bloom_filter_index_reader_writer_template( } // test nullptr EXPECT_TRUE(bf->test_bytes(nullptr, 1)); - + if (is_slice_type) { + Slice* value = (Slice*)(not_exist_value); + EXPECT_FALSE(bf->test_bytes(value->data, value->size)); + } else { + EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType))); + } delete reader; } } @@ -295,5 +300,458 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_decimal) { delete[] val; } +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_char) { + size_t num = 1024 * 3; + std::string* val = new std::string[num]; + for (int i = 0; i < num; ++i) { + // there will be 3 bloom filter pages + val[i] = "primary_key_" + std::to_string(10000 + i); + } + Slice* slices = new Slice[num]; + for (int i = 0; i < num; ++i) { + // there will be 3 bloom filter pages + slices[i] = Slice(val[i].c_str(), val[i].size()); + } + std::string file_name = "primary_key_bloom_filter_index_char"; + Slice not_exist_value("primary_key_not_exist_char"); + auto st = test_bloom_filter_index_reader_writer_template( + file_name, slices, num, 1, ¬_exist_value, true, true); + EXPECT_TRUE(st.ok()); + delete[] val; + delete[] slices; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index) { + size_t num = 1024 * 3; + std::vector val_strings(num); + for (size_t i = 0; i < num; ++i) { + val_strings[i] = "primary_key_" + std::to_string(i); + } + std::vector slices(num); + for (size_t i = 0; i < num; ++i) { + slices[i] = Slice(val_strings[i]); + } + + std::string file_name = "primary_key_bloom_filter_index"; + Slice not_exist_value("primary_key_not_exist"); + + auto st = test_bloom_filter_index_reader_writer_template( + file_name, slices.data(), num, 0, ¬_exist_value, true, true); + EXPECT_TRUE(st.ok()); +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_int) { + size_t num = 1024 * 3; + int* val = new int[num]; + for (int i = 0; i < num; ++i) { + // there will be 3 bloom filter pages + val[i] = 10000 + i + 1; + } + + std::string file_name = "primary_key_bloom_filter_index_int"; + int not_exist_value = 18888; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value, false, true); + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_datev2) { + size_t num = 1024 * 3; + uint32_t* val = new uint32_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = 20210101 + i; // YYYYMMDD + } + + std::string file_name = "bloom_filter_datev2"; + uint32_t not_exist_value = 20211231; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_datetimev2) { + size_t num = 1024 * 3; + uint64_t* val = new uint64_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = 20210101000000 + i; // YYYYMMDDHHMMSS + } + + std::string file_name = "bloom_filter_datetimev2"; + uint64_t not_exist_value = 20211231235959; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_decimal32) { + size_t num = 1024 * 3; + int32_t* val = new int32_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = static_cast(i * 100 + 1); + } + + std::string file_name = "bloom_filter_decimal32"; + int32_t not_exist_value = 99999; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_decimal64) { + size_t num = 1024 * 3; + ; + int64_t* val = new int64_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = static_cast(i * 1000 + 123); + } + + std::string file_name = "bloom_filter_decimal64"; + int64_t not_exist_value = 9999999; + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_ipv4) { + size_t num = 1024 * 3; // 3072 + uint32_t* val = new uint32_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = (192 << 24) | (168 << 16) | (i & 0xFFFF); + } + + std::string file_name = "bloom_filter_ipv4"; + uint32_t not_exist_value = (10 << 24) | (0 << 16) | (0 << 8) | 1; // 10.0.0.1 + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_decimal128i) { + size_t num = 1024 * 3; + int128_t* val = new int128_t[num]; + + int128_t base_value = int128_t(1000000000ULL) * int128_t(1000000000ULL); + + for (size_t i = 0; i < num; ++i) { + val[i] = base_value + int128_t(i); + } + + std::string file_name = "bloom_filter_decimal128i"; + int128_t not_exist_value = int128_t(9999999999999999999ULL); + + auto st = + test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_decimal256) { + size_t num = 1024 * 3; + using Decimal256Type = wide::Int256; + + Decimal256Type* val = new Decimal256Type[num]; + + Decimal256Type base_value = Decimal256Type(1000000000ULL); // 1e9 + base_value *= Decimal256Type(1000000000ULL); // base_value = 1e18 + base_value *= Decimal256Type(100000000ULL); // base_value = 1e26 + base_value *= Decimal256Type(100000000ULL); // base_value = 1e34 + base_value *= Decimal256Type(10000ULL); // base_value = 1e38 + + for (size_t i = 0; i < num; ++i) { + val[i] = base_value + Decimal256Type(i); + } + + std::string file_name = "bloom_filter_decimal256"; + + Decimal256Type not_exist_value = base_value + Decimal256Type(9999999ULL); + + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_ipv6) { + size_t num = 1024 * 3; + uint128_t* val = new uint128_t[num]; + for (size_t i = 0; i < num; ++i) { + val[i] = (uint128_t(0x20010DB800000000) << 64) | uint128_t(i); + } + + std::string file_name = "bloom_filter_ipv6"; + uint128_t not_exist_value = (uint128_t(0x20010DB800000000) << 64) | uint128_t(999999); + + auto st = test_bloom_filter_index_reader_writer_template( + file_name, val, num, 1, ¬_exist_value); + EXPECT_TRUE(st.ok()); + delete[] val; +} + +template +Status write_ngram_bloom_filter_index_file(const std::string& file_name, Slice* values, + size_t num_values, const TypeInfo* type_info, + BloomFilterIndexWriter* bf_index_writer, + ColumnIndexMetaPB* meta) { + auto fs = io::global_local_filesystem(); + std::string fname = dname + "/" + file_name; + io::FileWriterPtr file_writer; + Status st = fs->create_file(fname, &file_writer); + EXPECT_TRUE(st.ok()) << st.to_string(); + + size_t i = 0; + while (i < num_values) { + size_t num = std::min(static_cast(1024), num_values - i); + st = bf_index_writer->add_values(values + i, num); + EXPECT_TRUE(st.ok()); + st = bf_index_writer->flush(); + EXPECT_TRUE(st.ok()); + i += num; + } + bf_index_writer->add_nulls(1); + st = bf_index_writer->finish(file_writer.get(), meta); + EXPECT_TRUE(st.ok()) << "Writer finish status: " << st.to_string(); + EXPECT_TRUE(file_writer->close().ok()); + + return Status::OK(); +} + +Status read_and_test_ngram_bloom_filter_index_file(const std::string& file_name, size_t num_values, + uint8_t gram_size, uint16_t bf_size, + const ColumnIndexMetaPB& meta, + const std::vector& test_patterns) { + BloomFilterIndexReader* reader = nullptr; + std::unique_ptr iter; + get_bloom_filter_reader_iter(file_name, meta, &reader, &iter); + EXPECT_EQ(reader->algorithm(), BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER); + + NgramTokenExtractor extractor(gram_size); + uint16_t gram_bf_size = bf_size; + + size_t total_pages = (num_values + 1023) / 1024; + for (size_t page = 0; page < total_pages; ++page) { + std::unique_ptr bf; + auto st = iter->read_bloom_filter(page, &bf); + EXPECT_TRUE(st.ok()); + + for (const auto& pattern : test_patterns) { + std::unique_ptr query_bf; + st = BloomFilter::create(NGRAM_BLOOM_FILTER, &query_bf, gram_bf_size); + EXPECT_TRUE(st.ok()); + + if (extractor.string_like_to_bloom_filter(pattern.data(), pattern.size(), *query_bf)) { + bool contains = bf->contains(*query_bf); + bool expected = false; + if ((page == 0 && (pattern == "ngram15" || pattern == "ngram1000")) || + (page == 1 && pattern == "ngram1499")) { + expected = true; + } + EXPECT_EQ(contains, expected) << "Pattern: " << pattern << ", Page: " << page; + } + } + } + + delete reader; + return Status::OK(); +} + +template +Status test_ngram_bloom_filter_index_reader_writer(const std::string& file_name, Slice* values, + size_t num_values, uint8_t gram_size, + uint16_t bf_size) { + const auto* type_info = get_scalar_type_info(); + ColumnIndexMetaPB meta; + + BloomFilterOptions bf_options; + std::unique_ptr bf_index_writer; + RETURN_IF_ERROR(NGramBloomFilterIndexWriterImpl::create(bf_options, type_info, gram_size, + bf_size, &bf_index_writer)); + + RETURN_IF_ERROR(write_ngram_bloom_filter_index_file( + file_name, values, num_values, type_info, bf_index_writer.get(), &meta)); + + std::vector test_patterns = {"ngram15", "ngram1000", "ngram1499", + "non-existent-string"}; + + RETURN_IF_ERROR(read_and_test_ngram_bloom_filter_index_file(file_name, num_values, gram_size, + bf_size, meta, test_patterns)); + + return Status::OK(); +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_ngram_bloom_filter) { + size_t num = 1500; + std::vector val(num); + for (size_t i = 0; i < num; ++i) { + val[i] = "ngram" + std::to_string(i); + } + std::vector slices(num); + for (size_t i = 0; i < num; ++i) { + slices[i] = Slice(val[i].data(), val[i].size()); + } + + uint8_t gram_size = 5; + uint16_t bf_size = 65535; + + auto st = test_ngram_bloom_filter_index_reader_writer( + "bloom_filter_ngram_varchar", slices.data(), num, gram_size, bf_size); + EXPECT_TRUE(st.ok()); + st = test_ngram_bloom_filter_index_reader_writer( + "bloom_filter_ngram_char", slices.data(), num, gram_size, bf_size); + EXPECT_TRUE(st.ok()); + st = test_ngram_bloom_filter_index_reader_writer( + "bloom_filter_ngram_string", slices.data(), num, gram_size, bf_size); + EXPECT_TRUE(st.ok()); + st = test_ngram_bloom_filter_index_reader_writer( + "bloom_filter_ngram_string", slices.data(), num, gram_size, bf_size); + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); +} +void test_ngram_bloom_filter_with_size(uint16_t bf_size) { + const auto* type_info = get_scalar_type_info(); + ColumnIndexMetaPB meta; + + BloomFilterOptions bf_options; + size_t num = 1500; + std::vector val(num); + for (size_t i = 0; i < num; ++i) { + val[i] = "ngram" + std::to_string(i); + } + std::vector slices(num); + for (size_t i = 0; i < num; ++i) { + slices[i] = Slice(val[i].data(), val[i].size()); + } + size_t total_pages = (num + 1024 - 1) / 1024; + uint8_t gram_size = 5; + + std::unique_ptr bf_index_writer; + auto st = NGramBloomFilterIndexWriterImpl::create(bf_options, type_info, gram_size, bf_size, + &bf_index_writer); + EXPECT_TRUE(st.ok()); + + std::string file_name = "bloom_filter_ngram_varchar_size_" + std::to_string(bf_size); + st = write_ngram_bloom_filter_index_file( + file_name, slices.data(), num, type_info, bf_index_writer.get(), &meta); + EXPECT_TRUE(st.ok()); + EXPECT_EQ(bf_index_writer->size(), static_cast(bf_size) * total_pages); +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_ngram_bloom_filter_size) { + std::vector bf_sizes = {1024, 2048, 4096, 8192, 16384, 32768, 65535}; + for (uint16_t bf_size : bf_sizes) { + test_ngram_bloom_filter_with_size(bf_size); + } +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_unsupported_type) { + auto type_info = get_scalar_type_info(); + BloomFilterOptions bf_options; + std::unique_ptr bloom_filter_index_writer; + auto st = BloomFilterIndexWriter::create(bf_options, type_info, &bloom_filter_index_writer); + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); +} + +// Test function for verifying Bloom Filter FPP +void test_bloom_filter_fpp(double expected_fpp) { + size_t n = 10000; // Number of elements to insert into the Bloom Filter + size_t m = 100000; // Number of non-existent elements to test for false positives + + // Generate and insert elements into the Bloom Filter index + std::vector insert_values; + for (size_t i = 0; i < n; ++i) { + int64_t val = static_cast(i); + insert_values.push_back(val); + } + + // Write the Bloom Filter index to file + std::string file_name = "bloom_filter_fpp_test"; + ColumnIndexMetaPB index_meta; + Status st = write_bloom_filter_index_file( + file_name, insert_values.data(), n, 0, &index_meta, false, expected_fpp); + EXPECT_TRUE(st.ok()); + + // Read the Bloom Filter index + BloomFilterIndexReader* reader = nullptr; + std::unique_ptr iter; + get_bloom_filter_reader_iter(file_name, index_meta, &reader, &iter); + + // Read the Bloom Filter (only one page since we flushed once) + std::unique_ptr bf; + st = iter->read_bloom_filter(0, &bf); + EXPECT_TRUE(st.ok()); + + // Generate non-existent elements for testing false positive rate + std::unordered_set inserted_elements(insert_values.begin(), insert_values.end()); + std::unordered_set non_exist_elements; + std::vector test_values; + size_t max_value = n + m * 10; // Ensure test values are not in the inserted range + boost::mt19937_64 rng(12345); // Seed the random number generator for reproducibility + std::uniform_int_distribution dist(static_cast(n + 1), + static_cast(max_value)); + while (non_exist_elements.size() < m) { + int64_t val = dist(rng); + if (inserted_elements.find(val) == inserted_elements.end()) { + non_exist_elements.insert(val); + test_values.push_back(val); + } + } + + // Test non-existent elements and count false positives + size_t fp_count = 0; + for (const auto& val : test_values) { + if (bf->test_bytes(reinterpret_cast(&val), sizeof(int64_t))) { + fp_count++; + } + } + + // Compute actual false positive probability + double actual_fpp = static_cast(fp_count) / static_cast(m); + std::cout << "Expected FPP: " << expected_fpp << ", Actual FPP: " << actual_fpp << std::endl; + + // Verify that actual FPP is within the allowable error range + EXPECT_LE(actual_fpp, expected_fpp); + + delete reader; +} + +// Test case to run FPP tests with multiple expected FPP values +TEST_F(BloomFilterIndexReaderWriterTest, test_bloom_filter_fpp_multiple) { + std::vector fpp_values = {0.01, 0.02, 0.05}; + for (double fpp : fpp_values) { + test_bloom_filter_fpp(fpp); + } +} + +TEST_F(BloomFilterIndexReaderWriterTest, test_slice_memory_usage) { + size_t num = 1024 * 3; + const size_t slice_size = 256; + + std::vector data_buffer; + data_buffer.resize(num * slice_size); + + std::vector slice_vals(num); + for (size_t i = 0; i < num; ++i) { + char* ptr = data_buffer.data() + i * slice_size; + memset(ptr, 'a' + (i % 26), slice_size); + + slice_vals[i].data = ptr; + slice_vals[i].size = slice_size; + } + + std::string not_exist_str = "not_exist_val"; + Slice not_exist_value(not_exist_str); + + auto st = test_bloom_filter_index_reader_writer_template( + "bloom_filter_large_slices", slice_vals.data(), num, 1, ¬_exist_value, true, false); + EXPECT_TRUE(st.ok()); +} } // namespace segment_v2 } // namespace doris From a43aa52b41cb64ba219fefe8ddde4dfc410606a1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Fri, 27 Dec 2024 09:31:20 +0800 Subject: [PATCH 2/3] fix ut --- .../bloom_filter_index_reader_writer_test.cpp | 457 +----------------- 1 file changed, 2 insertions(+), 455 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index a395ee0cd2f266..28fbff6ed8a272 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -102,10 +102,10 @@ void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnInde io::FileReaderSPtr file_reader; ASSERT_EQ(io::global_local_filesystem()->open_file(fname, &file_reader), Status::OK()); *reader = new BloomFilterIndexReader(std::move(file_reader), meta.bloom_filter_index()); - auto st = (*reader)->load(true, false); + auto st = (*reader)->load(true, false, nullptr); EXPECT_TRUE(st.ok()); - st = (*reader)->new_iterator(iter); + st = (*reader)->new_iterator(iter, nullptr); EXPECT_TRUE(st.ok()); } @@ -300,458 +300,5 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_decimal) { delete[] val; } -TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_char) { - size_t num = 1024 * 3; - std::string* val = new std::string[num]; - for (int i = 0; i < num; ++i) { - // there will be 3 bloom filter pages - val[i] = "primary_key_" + std::to_string(10000 + i); - } - Slice* slices = new Slice[num]; - for (int i = 0; i < num; ++i) { - // there will be 3 bloom filter pages - slices[i] = Slice(val[i].c_str(), val[i].size()); - } - std::string file_name = "primary_key_bloom_filter_index_char"; - Slice not_exist_value("primary_key_not_exist_char"); - auto st = test_bloom_filter_index_reader_writer_template( - file_name, slices, num, 1, ¬_exist_value, true, true); - EXPECT_TRUE(st.ok()); - delete[] val; - delete[] slices; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index) { - size_t num = 1024 * 3; - std::vector val_strings(num); - for (size_t i = 0; i < num; ++i) { - val_strings[i] = "primary_key_" + std::to_string(i); - } - std::vector slices(num); - for (size_t i = 0; i < num; ++i) { - slices[i] = Slice(val_strings[i]); - } - - std::string file_name = "primary_key_bloom_filter_index"; - Slice not_exist_value("primary_key_not_exist"); - - auto st = test_bloom_filter_index_reader_writer_template( - file_name, slices.data(), num, 0, ¬_exist_value, true, true); - EXPECT_TRUE(st.ok()); -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_primary_key_bloom_filter_index_int) { - size_t num = 1024 * 3; - int* val = new int[num]; - for (int i = 0; i < num; ++i) { - // there will be 3 bloom filter pages - val[i] = 10000 + i + 1; - } - - std::string file_name = "primary_key_bloom_filter_index_int"; - int not_exist_value = 18888; - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value, false, true); - EXPECT_FALSE(st.ok()); - EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_datev2) { - size_t num = 1024 * 3; - uint32_t* val = new uint32_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = 20210101 + i; // YYYYMMDD - } - - std::string file_name = "bloom_filter_datev2"; - uint32_t not_exist_value = 20211231; - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_datetimev2) { - size_t num = 1024 * 3; - uint64_t* val = new uint64_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = 20210101000000 + i; // YYYYMMDDHHMMSS - } - - std::string file_name = "bloom_filter_datetimev2"; - uint64_t not_exist_value = 20211231235959; - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_decimal32) { - size_t num = 1024 * 3; - int32_t* val = new int32_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = static_cast(i * 100 + 1); - } - - std::string file_name = "bloom_filter_decimal32"; - int32_t not_exist_value = 99999; - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_decimal64) { - size_t num = 1024 * 3; - ; - int64_t* val = new int64_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = static_cast(i * 1000 + 123); - } - - std::string file_name = "bloom_filter_decimal64"; - int64_t not_exist_value = 9999999; - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_ipv4) { - size_t num = 1024 * 3; // 3072 - uint32_t* val = new uint32_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = (192 << 24) | (168 << 16) | (i & 0xFFFF); - } - - std::string file_name = "bloom_filter_ipv4"; - uint32_t not_exist_value = (10 << 24) | (0 << 16) | (0 << 8) | 1; // 10.0.0.1 - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_decimal128i) { - size_t num = 1024 * 3; - int128_t* val = new int128_t[num]; - - int128_t base_value = int128_t(1000000000ULL) * int128_t(1000000000ULL); - - for (size_t i = 0; i < num; ++i) { - val[i] = base_value + int128_t(i); - } - - std::string file_name = "bloom_filter_decimal128i"; - int128_t not_exist_value = int128_t(9999999999999999999ULL); - - auto st = - test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_decimal256) { - size_t num = 1024 * 3; - using Decimal256Type = wide::Int256; - - Decimal256Type* val = new Decimal256Type[num]; - - Decimal256Type base_value = Decimal256Type(1000000000ULL); // 1e9 - base_value *= Decimal256Type(1000000000ULL); // base_value = 1e18 - base_value *= Decimal256Type(100000000ULL); // base_value = 1e26 - base_value *= Decimal256Type(100000000ULL); // base_value = 1e34 - base_value *= Decimal256Type(10000ULL); // base_value = 1e38 - - for (size_t i = 0; i < num; ++i) { - val[i] = base_value + Decimal256Type(i); - } - - std::string file_name = "bloom_filter_decimal256"; - - Decimal256Type not_exist_value = base_value + Decimal256Type(9999999ULL); - - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_ipv6) { - size_t num = 1024 * 3; - uint128_t* val = new uint128_t[num]; - for (size_t i = 0; i < num; ++i) { - val[i] = (uint128_t(0x20010DB800000000) << 64) | uint128_t(i); - } - - std::string file_name = "bloom_filter_ipv6"; - uint128_t not_exist_value = (uint128_t(0x20010DB800000000) << 64) | uint128_t(999999); - - auto st = test_bloom_filter_index_reader_writer_template( - file_name, val, num, 1, ¬_exist_value); - EXPECT_TRUE(st.ok()); - delete[] val; -} - -template -Status write_ngram_bloom_filter_index_file(const std::string& file_name, Slice* values, - size_t num_values, const TypeInfo* type_info, - BloomFilterIndexWriter* bf_index_writer, - ColumnIndexMetaPB* meta) { - auto fs = io::global_local_filesystem(); - std::string fname = dname + "/" + file_name; - io::FileWriterPtr file_writer; - Status st = fs->create_file(fname, &file_writer); - EXPECT_TRUE(st.ok()) << st.to_string(); - - size_t i = 0; - while (i < num_values) { - size_t num = std::min(static_cast(1024), num_values - i); - st = bf_index_writer->add_values(values + i, num); - EXPECT_TRUE(st.ok()); - st = bf_index_writer->flush(); - EXPECT_TRUE(st.ok()); - i += num; - } - bf_index_writer->add_nulls(1); - st = bf_index_writer->finish(file_writer.get(), meta); - EXPECT_TRUE(st.ok()) << "Writer finish status: " << st.to_string(); - EXPECT_TRUE(file_writer->close().ok()); - - return Status::OK(); -} - -Status read_and_test_ngram_bloom_filter_index_file(const std::string& file_name, size_t num_values, - uint8_t gram_size, uint16_t bf_size, - const ColumnIndexMetaPB& meta, - const std::vector& test_patterns) { - BloomFilterIndexReader* reader = nullptr; - std::unique_ptr iter; - get_bloom_filter_reader_iter(file_name, meta, &reader, &iter); - EXPECT_EQ(reader->algorithm(), BloomFilterAlgorithmPB::NGRAM_BLOOM_FILTER); - - NgramTokenExtractor extractor(gram_size); - uint16_t gram_bf_size = bf_size; - - size_t total_pages = (num_values + 1023) / 1024; - for (size_t page = 0; page < total_pages; ++page) { - std::unique_ptr bf; - auto st = iter->read_bloom_filter(page, &bf); - EXPECT_TRUE(st.ok()); - - for (const auto& pattern : test_patterns) { - std::unique_ptr query_bf; - st = BloomFilter::create(NGRAM_BLOOM_FILTER, &query_bf, gram_bf_size); - EXPECT_TRUE(st.ok()); - - if (extractor.string_like_to_bloom_filter(pattern.data(), pattern.size(), *query_bf)) { - bool contains = bf->contains(*query_bf); - bool expected = false; - if ((page == 0 && (pattern == "ngram15" || pattern == "ngram1000")) || - (page == 1 && pattern == "ngram1499")) { - expected = true; - } - EXPECT_EQ(contains, expected) << "Pattern: " << pattern << ", Page: " << page; - } - } - } - - delete reader; - return Status::OK(); -} - -template -Status test_ngram_bloom_filter_index_reader_writer(const std::string& file_name, Slice* values, - size_t num_values, uint8_t gram_size, - uint16_t bf_size) { - const auto* type_info = get_scalar_type_info(); - ColumnIndexMetaPB meta; - - BloomFilterOptions bf_options; - std::unique_ptr bf_index_writer; - RETURN_IF_ERROR(NGramBloomFilterIndexWriterImpl::create(bf_options, type_info, gram_size, - bf_size, &bf_index_writer)); - - RETURN_IF_ERROR(write_ngram_bloom_filter_index_file( - file_name, values, num_values, type_info, bf_index_writer.get(), &meta)); - - std::vector test_patterns = {"ngram15", "ngram1000", "ngram1499", - "non-existent-string"}; - - RETURN_IF_ERROR(read_and_test_ngram_bloom_filter_index_file(file_name, num_values, gram_size, - bf_size, meta, test_patterns)); - - return Status::OK(); -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_ngram_bloom_filter) { - size_t num = 1500; - std::vector val(num); - for (size_t i = 0; i < num; ++i) { - val[i] = "ngram" + std::to_string(i); - } - std::vector slices(num); - for (size_t i = 0; i < num; ++i) { - slices[i] = Slice(val[i].data(), val[i].size()); - } - - uint8_t gram_size = 5; - uint16_t bf_size = 65535; - - auto st = test_ngram_bloom_filter_index_reader_writer( - "bloom_filter_ngram_varchar", slices.data(), num, gram_size, bf_size); - EXPECT_TRUE(st.ok()); - st = test_ngram_bloom_filter_index_reader_writer( - "bloom_filter_ngram_char", slices.data(), num, gram_size, bf_size); - EXPECT_TRUE(st.ok()); - st = test_ngram_bloom_filter_index_reader_writer( - "bloom_filter_ngram_string", slices.data(), num, gram_size, bf_size); - EXPECT_TRUE(st.ok()); - st = test_ngram_bloom_filter_index_reader_writer( - "bloom_filter_ngram_string", slices.data(), num, gram_size, bf_size); - EXPECT_FALSE(st.ok()); - EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); -} -void test_ngram_bloom_filter_with_size(uint16_t bf_size) { - const auto* type_info = get_scalar_type_info(); - ColumnIndexMetaPB meta; - - BloomFilterOptions bf_options; - size_t num = 1500; - std::vector val(num); - for (size_t i = 0; i < num; ++i) { - val[i] = "ngram" + std::to_string(i); - } - std::vector slices(num); - for (size_t i = 0; i < num; ++i) { - slices[i] = Slice(val[i].data(), val[i].size()); - } - size_t total_pages = (num + 1024 - 1) / 1024; - uint8_t gram_size = 5; - - std::unique_ptr bf_index_writer; - auto st = NGramBloomFilterIndexWriterImpl::create(bf_options, type_info, gram_size, bf_size, - &bf_index_writer); - EXPECT_TRUE(st.ok()); - - std::string file_name = "bloom_filter_ngram_varchar_size_" + std::to_string(bf_size); - st = write_ngram_bloom_filter_index_file( - file_name, slices.data(), num, type_info, bf_index_writer.get(), &meta); - EXPECT_TRUE(st.ok()); - EXPECT_EQ(bf_index_writer->size(), static_cast(bf_size) * total_pages); -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_ngram_bloom_filter_size) { - std::vector bf_sizes = {1024, 2048, 4096, 8192, 16384, 32768, 65535}; - for (uint16_t bf_size : bf_sizes) { - test_ngram_bloom_filter_with_size(bf_size); - } -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_unsupported_type) { - auto type_info = get_scalar_type_info(); - BloomFilterOptions bf_options; - std::unique_ptr bloom_filter_index_writer; - auto st = BloomFilterIndexWriter::create(bf_options, type_info, &bloom_filter_index_writer); - EXPECT_FALSE(st.ok()); - EXPECT_EQ(st.code(), TStatusCode::NOT_IMPLEMENTED_ERROR); -} - -// Test function for verifying Bloom Filter FPP -void test_bloom_filter_fpp(double expected_fpp) { - size_t n = 10000; // Number of elements to insert into the Bloom Filter - size_t m = 100000; // Number of non-existent elements to test for false positives - - // Generate and insert elements into the Bloom Filter index - std::vector insert_values; - for (size_t i = 0; i < n; ++i) { - int64_t val = static_cast(i); - insert_values.push_back(val); - } - - // Write the Bloom Filter index to file - std::string file_name = "bloom_filter_fpp_test"; - ColumnIndexMetaPB index_meta; - Status st = write_bloom_filter_index_file( - file_name, insert_values.data(), n, 0, &index_meta, false, expected_fpp); - EXPECT_TRUE(st.ok()); - - // Read the Bloom Filter index - BloomFilterIndexReader* reader = nullptr; - std::unique_ptr iter; - get_bloom_filter_reader_iter(file_name, index_meta, &reader, &iter); - - // Read the Bloom Filter (only one page since we flushed once) - std::unique_ptr bf; - st = iter->read_bloom_filter(0, &bf); - EXPECT_TRUE(st.ok()); - - // Generate non-existent elements for testing false positive rate - std::unordered_set inserted_elements(insert_values.begin(), insert_values.end()); - std::unordered_set non_exist_elements; - std::vector test_values; - size_t max_value = n + m * 10; // Ensure test values are not in the inserted range - boost::mt19937_64 rng(12345); // Seed the random number generator for reproducibility - std::uniform_int_distribution dist(static_cast(n + 1), - static_cast(max_value)); - while (non_exist_elements.size() < m) { - int64_t val = dist(rng); - if (inserted_elements.find(val) == inserted_elements.end()) { - non_exist_elements.insert(val); - test_values.push_back(val); - } - } - - // Test non-existent elements and count false positives - size_t fp_count = 0; - for (const auto& val : test_values) { - if (bf->test_bytes(reinterpret_cast(&val), sizeof(int64_t))) { - fp_count++; - } - } - - // Compute actual false positive probability - double actual_fpp = static_cast(fp_count) / static_cast(m); - std::cout << "Expected FPP: " << expected_fpp << ", Actual FPP: " << actual_fpp << std::endl; - - // Verify that actual FPP is within the allowable error range - EXPECT_LE(actual_fpp, expected_fpp); - - delete reader; -} - -// Test case to run FPP tests with multiple expected FPP values -TEST_F(BloomFilterIndexReaderWriterTest, test_bloom_filter_fpp_multiple) { - std::vector fpp_values = {0.01, 0.02, 0.05}; - for (double fpp : fpp_values) { - test_bloom_filter_fpp(fpp); - } -} - -TEST_F(BloomFilterIndexReaderWriterTest, test_slice_memory_usage) { - size_t num = 1024 * 3; - const size_t slice_size = 256; - - std::vector data_buffer; - data_buffer.resize(num * slice_size); - - std::vector slice_vals(num); - for (size_t i = 0; i < num; ++i) { - char* ptr = data_buffer.data() + i * slice_size; - memset(ptr, 'a' + (i % 26), slice_size); - - slice_vals[i].data = ptr; - slice_vals[i].size = slice_size; - } - - std::string not_exist_str = "not_exist_val"; - Slice not_exist_value(not_exist_str); - - auto st = test_bloom_filter_index_reader_writer_template( - "bloom_filter_large_slices", slice_vals.data(), num, 1, ¬_exist_value, true, false); - EXPECT_TRUE(st.ok()); -} } // namespace segment_v2 } // namespace doris From 00028a7b79338710067dfeba52137a67c34d9cda Mon Sep 17 00:00:00 2001 From: airborne12 Date: Fri, 27 Dec 2024 09:53:23 +0800 Subject: [PATCH 3/3] fix ut --- .../segment_v2/bloom_filter_index_reader_writer_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 28fbff6ed8a272..2b0d3783938225 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -102,10 +102,10 @@ void get_bloom_filter_reader_iter(const std::string& file_name, const ColumnInde io::FileReaderSPtr file_reader; ASSERT_EQ(io::global_local_filesystem()->open_file(fname, &file_reader), Status::OK()); *reader = new BloomFilterIndexReader(std::move(file_reader), meta.bloom_filter_index()); - auto st = (*reader)->load(true, false, nullptr); + auto st = (*reader)->load(true, false); EXPECT_TRUE(st.ok()); - st = (*reader)->new_iterator(iter, nullptr); + st = (*reader)->new_iterator(iter); EXPECT_TRUE(st.ok()); }