From 9f156dd01fb5688c7037df2150acefb32262ce8f Mon Sep 17 00:00:00 2001
From: qiye <jianliang5669@gmail.com>
Date: Sun, 16 Jun 2024 10:02:14 +0800
Subject: [PATCH 1/2] [fix](inverted index) Support Chinese column name with
 inverted index (#36321)

1. Building a `std::wstring` from a `std::string`'s iterator range
widens each byte separately, so it is only correct for ASCII. For
non-ASCII (e.g. Chinese) names, use `StringUtil::string_to_wstring`
instead.
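2. Fix index_tool `check_terms_stats_v2` to read the index id from
`--idx_id` instead of hard-coding 1, and print the field name next to
each term in the terms-stats output.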

Issue Number: #34118
---
 be/src/index-tools/index_tool.cpp             | 31 ++++++++-----
 .../segment_v2/inverted_index_reader.cpp      |  6 +--
 .../segment_v2/inverted_index_writer.cpp      |  2 +-
 .../test_index_chinese_column.out             |  7 +++
 .../test_index_chinese_column.groovy          | 44 +++++++++++++++++++
 5 files changed, 74 insertions(+), 16 deletions(-)
 create mode 100644 regression-test/data/inverted_index_p0/test_index_chinese_column.out
 create mode 100644 regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp
index 32ed178e36c26c..9bc845bfb7f7af 100644
--- a/be/src/index-tools/index_tool.cpp
+++ b/be/src/index-tools/index_tool.cpp
@@ -31,6 +31,7 @@
 #include <string>
 #include <vector>
 
+#include "CLucene/util/stringUtil.h"
 #include "io/fs/file_reader.h"
 #ifdef __clang__
 #pragma clang diagnostic push
@@ -93,14 +94,17 @@ std::string get_usage(const std::string& progname) {
     ss << "./index_tool --operation=term_query --directory=directory "
           "--idx_file_name=file --print_row_id --term=term --column_name=column_name "
           "--pred_type=eq/lt/gt/le/ge/match etc\n";
-    ss << "./index_tool --operation=write_index_v2 --idx_file_path=path/to/index "
-          "--data_file_path=data/to/index\n";
     ss << "*** debug_index_compaction operation is only for offline debug index compaction, do not "
           "use in production ***\n";
     ss << "./index_tool --operation=debug_index_compaction --idx_id=index_id "
           "--src_idx_dirs_file=path/to/file --dest_idx_dirs_file=path/to/file "
           "--dest_seg_num_rows_file=path/to/file --tablet_path=path/to/tablet "
           "--trans_vec_file=path/to/file\n";
+    ss << "./index_tool --operation=write_index_v2 --idx_file_path=path/to/index "
+          "--data_file_path=data/to/index\n";
+    ss << "./index_tool --operation=show_nested_files_v2 --idx_file_path=path/to/file\n";
+    ss << "./index_tool --operation=check_terms_stats_v2 --idx_file_path=path/to/file "
+          "--idx_id=index_id\n";
     return ss.str();
 }
 
@@ -129,21 +133,21 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke
 
     std::cout << "version: " << (int32_t)(reader->getIndexVersion()) << std::endl;
 
-    std::wstring field_ws(field.begin(), field.end());
+    auto field_ws = StringUtil::string_to_wstring(field);
     if (pred == "match_all") {
     } else if (pred == "match_phrase") {
         std::vector<std::string> terms = split(token, '|');
         auto* phrase_query = new lucene::search::PhraseQuery();
         for (auto& term : terms) {
-            std::wstring term_ws = StringUtil::string_to_wstring(term);
+            auto term_ws = StringUtil::string_to_wstring(term);
             auto* t = _CLNEW lucene::index::Term(field_ws.c_str(), term_ws.c_str());
             phrase_query->add(t);
             _CLDECDELETE(t);
         }
         query.reset(phrase_query);
     } else {
-        std::wstring token_ws(token.begin(), token.end());
-        lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str());
+        auto token_ws = StringUtil::string_to_wstring(token);
+        auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str());
         if (pred == "eq" || pred == "match") {
             query.reset(new lucene::search::TermQuery(term));
         } else if (pred == "lt") {
@@ -205,7 +209,10 @@ void check_terms_stats(lucene::store::Directory* dir) {
         /* empty */
         std::string token =
                 lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength());
+        std::string field = lucene_wcstoutf8string(te->term(false)->field(),
+                                                   lenOfString(te->term(false)->field()));
 
+        printf("Field: %s ", field.c_str());
         printf("Term: %s ", token.c_str());
         printf("Freq: %d\n", te->docFreq());
         if (FLAGS_print_doc_id) {
@@ -430,7 +437,7 @@ int main(int argc, char** argv) {
 
         std::string index_writer_path = tablet_path + "/tmp_index_writer";
         lucene::store::Directory* dir =
-                DorisCompoundDirectoryFactory::getDirectory(fs, index_writer_path.c_str(), false);
+                DorisFSDirectoryFactory::getDirectory(fs, index_writer_path.c_str(), false);
         lucene::analysis::SimpleAnalyzer<char> analyzer;
         auto index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true /* create */,
                                                               true /* closeDirOnShutdown */);
@@ -443,7 +450,7 @@ int main(int argc, char** argv) {
             std::string src_idx_full_name =
                     src_index_files[i] + "_" + std::to_string(index_id) + ".idx";
             DorisCompoundReader* reader = new DorisCompoundReader(
-                    DorisCompoundDirectoryFactory::getDirectory(fs, tablet_path.c_str()),
+                    DorisFSDirectoryFactory::getDirectory(fs, tablet_path.c_str()),
                     src_idx_full_name.c_str());
             src_index_dirs[i] = reader;
         }
@@ -454,7 +461,7 @@ int main(int argc, char** argv) {
             // format: rowsetId_segmentId_columnId
             auto path = tablet_path + "/" + dest_index_files[i] + "_" + std::to_string(index_id);
             dest_index_dirs[i] =
-                    DorisCompoundDirectoryFactory::getDirectory(fs, path.c_str(), true);
+                    DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true);
         }
 
         index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec,
@@ -556,7 +563,7 @@ int main(int argc, char** argv) {
         auto field_config = (int32_t)(lucene::document::Field::STORE_NO);
         field_config |= (int32_t)(lucene::document::Field::INDEX_NONORMS);
         field_config |= lucene::document::Field::INDEX_TOKENIZED;
-        auto field_name = std::wstring(name.begin(), name.end());
+        auto field_name = StringUtil::string_to_wstring(name);
         auto field = _CLNEW lucene::document::Field(field_name.c_str(), field_config);
         field->setOmitTermFreqAndPositions(false);
         doc->add(*field);
@@ -633,7 +640,7 @@ int main(int argc, char** argv) {
             std::cerr << "error occurred when show files: " << err.what() << std::endl;
         }
     } else if (FLAGS_operation == "check_terms_stats_v2") {
-        if (FLAGS_idx_file_path == "") {
+        if (FLAGS_idx_file_path == "" || FLAGS_idx_id <= 0) {
             std::cout << "no file flag for check " << std::endl;
             return -1;
         }
@@ -650,7 +657,7 @@ int main(int argc, char** argv) {
                 return -1;
             }
             std::vector<std::string> files;
-            int64_t index_id = 1;
+            int64_t index_id = FLAGS_idx_id;
             std::string index_suffix = "";
             doris::TabletIndexPB index_pb;
             index_pb.set_index_id(index_id);
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0b3d01bbf9f95e..68f3a9b95241a1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -123,7 +123,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r
                                              bool drop_duplicates) {
     analyse_result.clear();
 
-    std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
+    std::wstring field_ws = StringUtil::string_to_wstring(field_name);
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
             analyzer->tokenStream(field_ws.c_str(), reader));
 
@@ -311,7 +311,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
         }
 
         std::unique_ptr<lucene::search::Query> query;
-        query_info.field_name = std::wstring(column_name.begin(), column_name.end());
+        query_info.field_name = StringUtil::string_to_wstring(column_name);
 
         if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
@@ -422,7 +422,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
     // std::string search_str = reinterpret_cast<const StringRef*>(query_value)->to_string();
     VLOG_DEBUG << "begin to query the inverted index from clucene"
                << ", column_name: " << column_name << ", search_str: " << search_str;
-    std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
+    std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
     std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
     // unique_ptr with custom deleter
     std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index fbade800b5d504..c838621f92da32 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -86,7 +86,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
         _parser_type = get_inverted_index_parser_type_from_string(
                 get_parser_string_from_properties(_index_meta->properties()));
         _value_key_coder = get_key_coder(field_type);
-        _field_name = std::wstring(field_name.begin(), field_name.end());
+        _field_name = StringUtil::string_to_wstring(field_name);
     }
 
     ~InvertedIndexColumnWriterImpl() override {
diff --git a/regression-test/data/inverted_index_p0/test_index_chinese_column.out b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
new file mode 100644
index 00000000000000..8b3ebab527ec9c
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
@@ -0,0 +1,7 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1	json love anny	json	anny	2023-10-10T12:11:11
+
+-- !sql --
+1	json love anny	json	anny	2023-10-10T12:11:11
+
diff --git a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
new file mode 100644
index 00000000000000..21a94e1ffef2b5
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_index_chinese_column", "inverted_index_select"){
+    def createAndInsertData = { table_name, inverted_index_storage_format ->
+        sql "DROP TABLE IF EXISTS ${table_name}"
+        sql """
+            CREATE TABLE ${table_name}
+            (
+                k1 int ,
+                名称 string,
+                k3 char(50),
+                k4 varchar(200),
+                k5 datetime,
+                index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257")
+            )
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES("replication_num" = "1","inverted_index_storage_format" = "${inverted_index_storage_format}")
+        """
+        sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') "
+        qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'"
+    }
+
+    def table_name_v1 = "test_index_chinese_column_v1"
+    def table_name_v2 = "test_index_chinese_column_v2"
+
+    sql "set enable_unicode_name_support=true"
+
+    createAndInsertData(table_name_v1, "V1")
+    createAndInsertData(table_name_v2, "V2")
+}

From 298df985ae71bde466791475b31f81a288af6a44 Mon Sep 17 00:00:00 2001
From: Luennng <luennng@gmail.com>
Date: Mon, 17 Jun 2024 12:18:54 +0800
Subject: [PATCH 2/2] format

---
 be/src/index-tools/index_tool.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp
index 9bc845bfb7f7af..9587e6f7b1f5dc 100644
--- a/be/src/index-tools/index_tool.cpp
+++ b/be/src/index-tools/index_tool.cpp
@@ -460,8 +460,7 @@ int main(int argc, char** argv) {
         for (int i = 0; i < dest_segment_num; ++i) {
             // format: rowsetId_segmentId_columnId
             auto path = tablet_path + "/" + dest_index_files[i] + "_" + std::to_string(index_id);
-            dest_index_dirs[i] =
-                    DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true);
+            dest_index_dirs[i] = DorisFSDirectoryFactory::getDirectory(fs, path.c_str(), true);
         }
 
         index_writer->indexCompaction(src_index_dirs, dest_index_dirs, trans_vec,