diff --git a/be/src/olap/collection_statistics.cpp b/be/src/olap/collection_statistics.cpp
index 714a19fe6b7798..94130a1e6a74f6 100644
--- a/be/src/olap/collection_statistics.cpp
+++ b/be/src/olap/collection_statistics.cpp
@@ -17,6 +17,7 @@
 
 #include "collection_statistics.h"
 
+#include <set>
 #include <sstream>
 
 #include "common/exception.h"
@@ -26,6 +27,7 @@
 #include "olap/rowset/segment_v2/index_reader_helper.h"
 #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
 #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+#include "util/uid_util.h"
 #include "vec/exprs/vexpr.h"
 #include "vec/exprs/vexpr_context.h"
 #include "vec/exprs/vliteral.h"
@@ -64,31 +66,49 @@ Status CollectionStatistics::collect(
         }
     }
 
-#ifndef NDEBUG
-    std::stringstream ss;
-    ss << "term_num_docs: " << _total_num_docs;
-    for (const auto& [ws_field_name, num_tokens] : _total_num_tokens) {
-        ss << ", [field_name: " << StringHelper::to_string(ws_field_name)
-           << ", num_tokens: " << num_tokens;
-        auto it = _term_doc_freqs.find(ws_field_name);
-        if (it != _term_doc_freqs.end()) {
-            ss << ", terms: {";
-            bool first = true;
-            for (const auto& [term, doc_freq] : it->second) {
-                if (!first) {
-                    ss << ", ";
-                }
-                ss << StringHelper::to_string(term) << ": " << doc_freq;
-                first = false;
+    // Build a single-line log with query_id, tablet_ids, and per-field term statistics
+    if (VLOG_IS_ON(1)) {
+        std::set<int64_t> tablet_ids;
+        for (const auto& rs_split : rs_splits) {
+            if (rs_split.rs_reader && rs_split.rs_reader->rowset()) {
+                tablet_ids.insert(rs_split.rs_reader->rowset()->rowset_meta()->tablet_id());
             }
-            ss << "}";
-        } else {
-            ss << ", (no term stats)";
         }
-        ss << "]";
+
+        std::ostringstream oss;
+        oss << "CollectionStatistics: query_id=" << print_id(state->query_id());
+
+        oss << ", tablet_ids=[";
+        bool first_tablet = true;
+        for (int64_t tid : tablet_ids) {
+            if (!first_tablet) oss << ",";
+            oss << tid;
+            first_tablet = false;
+        }
+        oss << "]";
+
+        oss << ", total_num_docs=" << _total_num_docs;
+
+        for (const auto& [ws_field_name, num_tokens] : _total_num_tokens) {
+            oss << ", {field=" << StringHelper::to_string(ws_field_name)
+                << ", num_tokens=" << num_tokens << ", terms=[";
+
+            // A field can have a token count without any collected term statistics (the
+            // code this replaces handled that case), so look it up with find(): at() would throw.
+            auto it = _term_doc_freqs.find(ws_field_name);
+            bool first_term = true;
+            if (it != _term_doc_freqs.end()) {
+                for (const auto& [term, doc_freq] : it->second) {
+                    if (!first_term) oss << ", ";
+                    oss << "(" << StringHelper::to_string(term) << ":" << doc_freq << ")";
+                    first_term = false;
+                }
+            }
+            oss << "]}";
+        }
+
+        VLOG(1) << oss.str();
     }
-    LOG(INFO) << "CollectionStatistics: " << ss.str();
-#endif
 
     return Status::OK();
 }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp b/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
index a01b467c1e0c89..88865e140a625d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity.cpp
@@ -29,7 +29,7 @@ const int32_t BM25Similarity::NUM_FREE_VALUES = 255 - static_cast<int>(MAX_INT4)
 std::vector<float> BM25Similarity::LENGTH_TABLE = []() {
     std::vector<float> table(256);
     for (int32_t i = 0; i < 256; i++) {
-        table[i] = int_to_byte4(i);
+        table[i] = static_cast<float>(byte4_to_int(static_cast<uint8_t>(i)));
     }
     return table;
 }();
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
index 34711aaac8b87d..e6d827a01cd60c 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/similarity/bm25_similarity_test.cpp
@@ -119,7 +119,7 @@ TEST_F(BM25SimilarityTest, ScoreEdgeCasesTest) {
     ASSERT_GT(score_high, 0.0f);
 
     float score_max_norm = similarity_->score(1.0f, 255);
-    ASSERT_GT(score_max_norm, 0.0f);
+    ASSERT_GE(score_max_norm, 0.0f);
 }
 
 TEST_F(BM25SimilarityTest, Int4EncodingTest) {
@@ -231,6 +231,32 @@ TEST_F(BM25SimilarityTest, LengthTableInitializationTest) {
     }
 }
 
+// Regression test: LENGTH_TABLE must hold the decoded length for every norm byte
+// (byte4_to_int), not the value int_to_byte4 returns for the table index.
+TEST_F(BM25SimilarityTest, LengthTableCorrectDecoding) {
+    // Every table slot must agree with the decoder.
+    for (int i = 0; i < 256; ++i) {
+        float expected = static_cast<float>(BM25Similarity::byte4_to_int(static_cast<uint8_t>(i)));
+        ASSERT_FLOAT_EQ(BM25Similarity::LENGTH_TABLE[i], expected)
+                << "LENGTH_TABLE[" << i << "] should equal byte4_to_int(" << i << ")";
+    }
+
+    // Encode sample lengths; the table and the decoder must agree and never exceed the input.
+    std::vector<int32_t> test_doc_lengths = {0, 1, 10, 50, 100, 500, 1000, 5000, 10000};
+    for (int32_t doc_len : test_doc_lengths) {
+        uint8_t encoded_norm = BM25Similarity::int_to_byte4(doc_len);
+        float decoded_via_table = BM25Similarity::LENGTH_TABLE[encoded_norm];
+        int32_t decoded_via_func = BM25Similarity::byte4_to_int(encoded_norm);
+
+        ASSERT_FLOAT_EQ(decoded_via_table, static_cast<float>(decoded_via_func))
+                << "Mismatch for doc_len=" << doc_len
+                << ", encoded_norm=" << static_cast<int>(encoded_norm);
+
+        ASSERT_LE(decoded_via_func, doc_len)
+                << "Decoded value should be <= original for doc_len=" << doc_len;
+    }
+}
+
 TEST_F(BM25SimilarityTest, DifferentParametersTest) {
     mock_stats_->set_mock_idf(1.0f);
     mock_stats_->set_mock_avg_dl(1.0f);