From 4a9af40f65ea7995c8d7dcb8cc7a5e361ee77b3b Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 18 Jun 2025 17:41:09 +0800 Subject: [PATCH 1/3] [fix](inverted index) catch IO exception to avoid coredump in inverted index string reader (#51844) Problem Summary: This PR adds error handling around CLucene interactions in the string inverted index reader to prevent core dumps on IO failures, and introduces unit tests covering these error scenarios. --- .../segment_v2/inverted_index_reader.cpp | 76 ++++---- .../segment_v2/inverted_index_reader_test.cpp | 169 +++++++++++++++++- 2 files changed, 207 insertions(+), 38 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 662c930ead2a93..6551cbbd04cd86 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -411,32 +411,33 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, std::string search_str(search_query->data, act_len); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; + try { + auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); + // try to get query bitmap result from cache and return immediately on cache hit + InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, + search_str}; + auto* cache = InvertedIndexQueryCache::instance(); + InvertedIndexQueryCacheHandle cache_handler; + auto cache_status = + handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); + if (cache_status.ok()) { + return Status::OK(); + } - auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); - // try to get query bitmap result from cache and return immediately on cache hit - InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, - search_str}; - 
auto* cache = InvertedIndexQueryCache::instance(); - InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); - if (cache_status.ok()) { - return Status::OK(); - } - - std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); - InvertedIndexQueryInfo query_info; - query_info.field_name = column_name_ws; - query_info.terms.emplace_back(search_str); + InvertedIndexQueryInfo query_info; + query_info.field_name = column_name_ws; + query_info.terms.emplace_back(search_str); - auto result = std::make_shared(); - FulltextIndexSearcherPtr* searcher_ptr = nullptr; - InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); - auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); - searcher_ptr = std::get_if(&searcher_variant); - if (searcher_ptr != nullptr) { - try { + auto result = std::make_shared(); + FulltextIndexSearcherPtr* searcher_ptr = nullptr; + InvertedIndexCacheHandle inverted_index_cache_handle; + RETURN_IF_ERROR( + handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); + auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); + searcher_ptr = std::get_if(&searcher_variant); + if (searcher_ptr != nullptr) { switch (query_type) { case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: @@ -488,27 +489,28 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, return Status::Error( "invalid query type when query untokenized inverted index"); } - } catch (const CLuceneError& e) { - if (is_range_query(query_type) && e.number() == CL_ERR_TooManyClauses) { - return Status::Error( - "range query term exceeds limits, try to downgrade from inverted index, " - "column " - "name:{}, search_str:{}", - 
column_name, search_str); - } else { - return Status::Error( - "CLuceneError occured, error msg: {}, column name: {}, search_str: {}", - e.what(), column_name, search_str); - } } - // add to cache result->runOptimize(); cache->insert(cache_key, result, &cache_handler); bit_map = result; + return Status::OK(); + } catch (const CLuceneError& e) { + if (is_range_query(query_type) && e.number() == CL_ERR_TooManyClauses) { + return Status::Error( + "range query term exceeds limits, try to downgrade from inverted index, " + "column " + "name:{}, search_str:{}", + column_name, search_str); + } else { + LOG(ERROR) << "CLuceneError occurred, error msg: " << e.what() + << ", column name: " << column_name << ", search_str: " << search_str; + return Status::Error( + "CLuceneError occurred, error msg: {}, column name: {}, search_str: {}", + e.what(), column_name, search_str); + } } - return Status::OK(); } InvertedIndexReaderType StringTypeInvertedIndexReader::type() { diff --git a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp index 524936829177e8..63569d9655a3ce 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp @@ -1531,6 +1531,163 @@ class InvertedIndexReaderTest : public testing::Test { } } + class MockStringTypeInvertedIndexReader final : public StringTypeInvertedIndexReader { + public: + static std::shared_ptr create_shared( + const TabletIndex* idx_meta, + std::shared_ptr& file_reader) { + return std::shared_ptr( + new MockStringTypeInvertedIndexReader(idx_meta, file_reader)); + } + + protected: + Status handle_searcher_cache(RuntimeState*, InvertedIndexCacheHandle*, const io::IOContext*, + OlapReaderStatistics*) override { + CLuceneError err; + err.set(CL_ERR_IO, "mock handle_searcher_cache failure"); + throw err; + } + + private: + MockStringTypeInvertedIndexReader(const TabletIndex* idx_meta, + 
std::shared_ptr& file_reader) + : StringTypeInvertedIndexReader(idx_meta, file_reader) {} + }; + + // Mock class for testing tokenized index query exceptions + class MockTokenizedStringTypeInvertedIndexReader final : public FullTextIndexReader { + public: + static std::shared_ptr create_shared( + const TabletIndex* idx_meta, + std::shared_ptr& file_reader) { + return std::shared_ptr( + new MockTokenizedStringTypeInvertedIndexReader(idx_meta, file_reader)); + } + + protected: + Status handle_searcher_cache(RuntimeState*, InvertedIndexCacheHandle*, const io::IOContext*, + OlapReaderStatistics*) override { + CLuceneError err; + err.set(CL_ERR_IO, "mock tokenized index searcher cache failure"); + throw err; + } + + private: + MockTokenizedStringTypeInvertedIndexReader( + const TabletIndex* idx_meta, std::shared_ptr& file_reader) + : FullTextIndexReader(idx_meta, file_reader) {} + }; + + void test_cache_error_scenarios() { + std::string_view rowset_id = "test_handle_searcher_cache_exception"; + int seg_id = 0; + std::vector values = {Slice("apple"), Slice("banana")}; + + TabletIndex idx_meta; + { + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(1); + index_meta_pb->set_index_name("test_mock_cache"); + index_meta_pb->add_col_unique_id(1); // c2 + idx_meta.init_from_pb(*index_meta_pb); + } + + std::string index_path_prefix; + prepare_string_index(rowset_id, seg_id, values, &idx_meta, &index_path_prefix); + + auto file_reader = std::make_shared( + io::global_local_filesystem(), index_path_prefix, InvertedIndexStorageFormatPB::V2); + ASSERT_TRUE(file_reader->init().ok()); + + auto mock_reader = MockStringTypeInvertedIndexReader::create_shared(&idx_meta, file_reader); + ASSERT_NE(mock_reader, nullptr); + + io::IOContext io_ctx; + OlapReaderStatistics stats; + RuntimeState runtime_state; + TQueryOptions opts; + opts.enable_inverted_index_searcher_cache = true; + 
runtime_state.set_query_options(opts); + + std::shared_ptr bitmap = std::make_shared(); + std::string field_name = "1"; // c2 unique_id + StringRef query_val(values[0].data, values[0].size); + + Status st = mock_reader->query(&io_ctx, &stats, &runtime_state, field_name, &query_val, + InvertedIndexQueryType::EQUAL_QUERY, bitmap); + + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR); + } + + void test_tokenized_index_query_error_scenarios() { + std::string_view rowset_id = "test_tokenized_index_query_exception"; + int seg_id = 0; + std::vector values = {Slice("Hello world this is a test"), + Slice("Apache Doris is a modern analytics database"), + Slice("Inverted index provides fast text search")}; + + TabletIndex idx_meta; + { + auto index_meta_pb = std::make_unique(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(2); + index_meta_pb->set_index_name("test_tokenized_mock_cache"); + index_meta_pb->add_col_unique_id(1); // c2 + + // Set tokenized index properties + auto* properties = index_meta_pb->mutable_properties(); + (*properties)[INVERTED_INDEX_PARSER_KEY] = INVERTED_INDEX_PARSER_ENGLISH; + (*properties)[INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY] = + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES; + (*properties)[INVERTED_INDEX_PARSER_LOWERCASE_KEY] = INVERTED_INDEX_PARSER_TRUE; + + idx_meta.init_from_pb(*index_meta_pb); + } + + std::string index_path_prefix; + prepare_string_index(rowset_id, seg_id, values, &idx_meta, &index_path_prefix); + + auto file_reader = std::make_shared( + io::global_local_filesystem(), index_path_prefix, InvertedIndexStorageFormatPB::V2); + ASSERT_TRUE(file_reader->init().ok()); + + auto mock_reader = + MockTokenizedStringTypeInvertedIndexReader::create_shared(&idx_meta, file_reader); + ASSERT_NE(mock_reader, nullptr); + + io::IOContext io_ctx; + OlapReaderStatistics stats; + RuntimeState runtime_state; + TQueryOptions opts; + opts.enable_inverted_index_searcher_cache = 
true; + runtime_state.set_query_options(opts); + + std::shared_ptr bitmap = std::make_shared(); + std::string field_name = "1"; // c2 unique_id + + // Test tokenized query with "world" which should be found in "Hello world this is a test" + std::string query_term = "world"; + StringRef query_val(query_term.data(), query_term.size()); + + Status st = mock_reader->query(&io_ctx, &stats, &runtime_state, field_name, &query_val, + InvertedIndexQueryType::MATCH_ANY_QUERY, bitmap); + + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR); + + // Test phrase query + std::string phrase_query = "Apache Doris"; + StringRef phrase_query_val(phrase_query.data(), phrase_query.size()); + + st = mock_reader->query(&io_ctx, &stats, &runtime_state, field_name, &phrase_query_val, + InvertedIndexQueryType::MATCH_PHRASE_QUERY, bitmap); + + EXPECT_FALSE(st.ok()); + EXPECT_EQ(st.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR); + } + private: std::unique_ptr _inverted_index_searcher_cache; std::unique_ptr _inverted_index_query_cache; @@ -1561,4 +1718,14 @@ TEST_F(InvertedIndexReaderTest, CompatibleTest) { test_compatible_read_cross_platform(); } -} // namespace doris::segment_v2 \ No newline at end of file +// Test cache error scenarios that could crash BE +TEST_F(InvertedIndexReaderTest, CacheErrorScenarios) { + test_cache_error_scenarios(); +} + +// Test tokenized index query error scenarios +TEST_F(InvertedIndexReaderTest, TokenizedIndexQueryErrorScenarios) { + test_tokenized_index_query_error_scenarios(); +} + +} // namespace doris::segment_v2 From 0b01d1c5d91fc242a7faa54aabd98cb0391e5d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E5=87=AF?= Date: Mon, 7 Jul 2025 14:16:29 +0800 Subject: [PATCH 2/3] fix --- be/src/olap/rowset/segment_v2/inverted_index_reader.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 6551cbbd04cd86..dfd128b4cae0d0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -418,8 +418,7 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, search_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = - handle_query_cache(runtime_state, cache, cache_key, &cache_handler, stats, bit_map); + auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } @@ -433,8 +432,7 @@ Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR( - handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { From 953f2346a7823ae6c81345e6975286e53667fa42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E5=87=AF?= Date: Mon, 7 Jul 2025 15:40:49 +0800 Subject: [PATCH 3/3] fix --- .../olap/rowset/segment_v2/inverted_index_reader_test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp index 63569d9655a3ce..2e27cef2539198 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_reader_test.cpp @@ -1541,7 +1541,7 @@ class InvertedIndexReaderTest : public testing::Test { } protected: - Status handle_searcher_cache(RuntimeState*, 
InvertedIndexCacheHandle*, const io::IOContext*, + Status handle_searcher_cache(InvertedIndexCacheHandle*, const io::IOContext*, OlapReaderStatistics*) override { CLuceneError err; err.set(CL_ERR_IO, "mock handle_searcher_cache failure"); @@ -1565,7 +1565,7 @@ class InvertedIndexReaderTest : public testing::Test { } protected: - Status handle_searcher_cache(RuntimeState*, InvertedIndexCacheHandle*, const io::IOContext*, + Status handle_searcher_cache(InvertedIndexCacheHandle*, const io::IOContext*, OlapReaderStatistics*) override { CLuceneError err; err.set(CL_ERR_IO, "mock tokenized index searcher cache failure"); @@ -1607,7 +1607,6 @@ class InvertedIndexReaderTest : public testing::Test { OlapReaderStatistics stats; RuntimeState runtime_state; TQueryOptions opts; - opts.enable_inverted_index_searcher_cache = true; runtime_state.set_query_options(opts); std::shared_ptr bitmap = std::make_shared(); @@ -1661,7 +1660,6 @@ class InvertedIndexReaderTest : public testing::Test { OlapReaderStatistics stats; RuntimeState runtime_state; TQueryOptions opts; - opts.enable_inverted_index_searcher_cache = true; runtime_state.set_query_options(opts); std::shared_ptr bitmap = std::make_shared();