From 9f156dd01fb5688c7037df2150acefb32262ce8f Mon Sep 17 00:00:00 2001 From: qiye Date: Sun, 16 Jun 2024 10:02:14 +0800 Subject: [PATCH 1/2] [fix](inverted index)Support Chinese column name with inverted index (#36321) 1. Constructing a `std::wstring` from a `std::string` iterator range (`std::wstring(s.begin(), s.end())`) widens each byte separately, so it is only correct for ASCII characters. For non-ASCII characters, such as Chinese column names, we need to use `StringUtil::string_to_wstring`. 2. Fix the index_tool `check_terms_stats_v2` operation to take the index id from `--idx_id` instead of a hard-coded value, and print the field name of each term. Issue Number: #34118 --- be/src/index-tools/index_tool.cpp | 31 ++++++++----- .../segment_v2/inverted_index_reader.cpp | 6 +-- .../segment_v2/inverted_index_writer.cpp | 2 +- .../test_index_chinese_column.out | 7 +++ .../test_index_chinese_column.groovy | 44 +++++++++++++++++++ 5 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_index_chinese_column.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index 32ed178e36c26c..9bc845bfb7f7af 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -31,6 +31,7 @@ #include #include +#include "CLucene/util/stringUtil.h" #include "io/fs/file_reader.h" #ifdef __clang__ #pragma clang diagnostic push @@ -93,14 +94,17 @@ std::string get_usage(const std::string& progname) { ss << "./index_tool --operation=term_query --directory=directory " "--idx_file_name=file --print_row_id --term=term --column_name=column_name " "--pred_type=eq/lt/gt/le/ge/match etc\n"; - ss << "./index_tool --operation=write_index_v2 --idx_file_path=path/to/index " - "--data_file_path=data/to/index\n"; ss << "*** debug_index_compaction operation is only for offline debug index compaction, do not " "use in production ***\n"; ss << "./index_tool --operation=debug_index_compaction --idx_id=index_id " "--src_idx_dirs_file=path/to/file --dest_idx_dirs_file=path/to/file " "--dest_seg_num_rows_file=path/to/file 
--tablet_path=path/to/tablet " "--trans_vec_file=path/to/file\n"; + ss << "./index_tool --operation=write_index_v2 --idx_file_path=path/to/index " + "--data_file_path=data/to/index\n"; + ss << "./index_tool --operation=show_nested_files_v2 --idx_file_path=path/to/file\n"; + ss << "./index_tool --operation=check_terms_stats_v2 --idx_file_path=path/to/file " + "--idx_id=index_id\n"; return ss.str(); } @@ -129,21 +133,21 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke std::cout << "version: " << (int32_t)(reader->getIndexVersion()) << std::endl; - std::wstring field_ws(field.begin(), field.end()); + auto field_ws = StringUtil::string_to_wstring(field); if (pred == "match_all") { } else if (pred == "match_phrase") { std::vector terms = split(token, '|'); auto* phrase_query = new lucene::search::PhraseQuery(); for (auto& term : terms) { - std::wstring term_ws = StringUtil::string_to_wstring(term); + auto term_ws = StringUtil::string_to_wstring(term); auto* t = _CLNEW lucene::index::Term(field_ws.c_str(), term_ws.c_str()); phrase_query->add(t); _CLDECDELETE(t); } query.reset(phrase_query); } else { - std::wstring token_ws(token.begin(), token.end()); - lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); + auto token_ws = StringUtil::string_to_wstring(token); + auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); if (pred == "eq" || pred == "match") { query.reset(new lucene::search::TermQuery(term)); } else if (pred == "lt") { @@ -205,7 +209,10 @@ void check_terms_stats(lucene::store::Directory* dir) { /* empty */ std::string token = lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength()); + std::string field = lucene_wcstoutf8string(te->term(false)->field(), + lenOfString(te->term(false)->field())); + printf("Field: %s ", field.c_str()); printf("Term: %s ", token.c_str()); printf("Freq: %d\n", te->docFreq()); if (FLAGS_print_doc_id) { @@ -430,7 
+437,7 @@ int main(int argc, char** argv) { std::string index_writer_path = tablet_path + "/tmp_index_writer"; lucene::store::Directory* dir = - DorisCompoundDirectoryFactory::getDirectory(fs, index_writer_path.c_str(), false); + DorisFSDirectoryFactory::getDirectory(fs, index_writer_path.c_str(), false); lucene::analysis::SimpleAnalyzer analyzer; auto index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true /* create */, true /* closeDirOnShutdown */); @@ -443,7 +450,7 @@ int main(int argc, char** argv) { std::string src_idx_full_name = src_index_files[i] + "_" + std::to_string(index_id) + ".idx"; DorisCompoundReader* reader = new DorisCompoundReader( - DorisCompoundDirectoryFactory::getDirectory(fs, tablet_path.c_str()), + DorisFSDirectoryFactory::getDirectory(fs, tablet_path.c_str()), src_idx_full_name.c_str()); src_index_dirs[i] = reader; } @@ -454,7 +461,7 @@ int main(int argc, char** argv) { // format: rowsetId_segmentId_columnId auto path = tablet_path + "/" + dest_index_files[i] + "_" + std::to_string(index_id); dest_index_dirs[i] = - DorisCompoundDirectoryFactory::getDirectory(fs, path.c_str(), true); + DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true); } index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec, @@ -556,7 +563,7 @@ int main(int argc, char** argv) { auto field_config = (int32_t)(lucene::document::Field::STORE_NO); field_config |= (int32_t)(lucene::document::Field::INDEX_NONORMS); field_config |= lucene::document::Field::INDEX_TOKENIZED; - auto field_name = std::wstring(name.begin(), name.end()); + auto field_name = StringUtil::string_to_wstring(name); auto field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); field->setOmitTermFreqAndPositions(false); doc->add(*field); @@ -633,7 +640,7 @@ int main(int argc, char** argv) { std::cerr << "error occurred when show files: " << err.what() << std::endl; } } else if (FLAGS_operation == "check_terms_stats_v2") { - if (FLAGS_idx_file_path 
== "") { + if (FLAGS_idx_file_path == "" || FLAGS_idx_id <= 0) { std::cout << "no file flag for check " << std::endl; return -1; } @@ -650,7 +657,7 @@ int main(int argc, char** argv) { return -1; } std::vector files; - int64_t index_id = 1; + int64_t index_id = FLAGS_idx_id; std::string index_suffix = ""; doris::TabletIndexPB index_pb; index_pb.set_index_id(index_id); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 0b3d01bbf9f95e..68f3a9b95241a1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -123,7 +123,7 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r bool drop_duplicates) { analyse_result.clear(); - std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); + std::wstring field_ws = StringUtil::string_to_wstring(field_name); std::unique_ptr token_stream( analyzer->tokenStream(field_ws.c_str(), reader)); @@ -311,7 +311,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } std::unique_ptr query; - query_info.field_name = std::wstring(column_name.begin(), column_name.end()); + query_info.field_name = StringUtil::string_to_wstring(column_name); if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || @@ -422,7 +422,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, // std::string search_str = reinterpret_cast(query_value)->to_string(); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end()); + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); // unique_ptr with custom 
deleter std::unique_ptr term { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index fbade800b5d504..c838621f92da32 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -86,7 +86,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())); _value_key_coder = get_key_coder(field_type); - _field_name = std::wstring(field_name.begin(), field_name.end()); + _field_name = StringUtil::string_to_wstring(field_name); } ~InvertedIndexColumnWriterImpl() override { diff --git a/regression-test/data/inverted_index_p0/test_index_chinese_column.out b/regression-test/data/inverted_index_p0/test_index_chinese_column.out new file mode 100644 index 00000000000000..8b3ebab527ec9c --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 json love anny json anny 2023-10-10T12:11:11 + +-- !sql -- +1 json love anny json anny 2023-10-10T12:11:11 + diff --git a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy new file mode 100644 index 00000000000000..21a94e1ffef2b5 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +suite("test_index_chinese_column", "inverted_index_select"){ + def createAndInsertData = { table_name, inverted_index_storage_format -> + sql "DROP TABLE IF EXISTS ${table_name}" + sql """ + CREATE TABLE ${table_name} + ( + k1 int , + 名称 string, + k3 char(50), + k4 varchar(200), + k5 datetime, + index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257") + ) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES("replication_num" = "1","inverted_index_storage_format" = "${inverted_index_storage_format}") + """ + sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') " + qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'" + } + + def table_name_v1 = "test_index_chinese_column_v1" + def table_name_v2 = "test_index_chinese_column_v2" + + sql "set enable_unicode_name_support=true" + + createAndInsertData(table_name_v1, "V1") + createAndInsertData(table_name_v2, "V2") +} From 298df985ae71bde466791475b31f81a288af6a44 Mon Sep 17 00:00:00 2001 From: Luennng Date: Mon, 17 Jun 2024 12:18:54 +0800 Subject: [PATCH 2/2] format --- be/src/index-tools/index_tool.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index 9bc845bfb7f7af..9587e6f7b1f5dc 100644 --- a/be/src/index-tools/index_tool.cpp +++ 
b/be/src/index-tools/index_tool.cpp @@ -460,8 +460,7 @@ int main(int argc, char** argv) { for (int i = 0; i < dest_segment_num; ++i) { // format: rowsetId_segmentId_columnId auto path = tablet_path + "/" + dest_index_files[i] + "_" + std::to_string(index_id); - dest_index_dirs[i] = - DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true); + dest_index_dirs[i] = DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true); } index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec,