diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index c2cc0bbbefa728..13a31e24768fc6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -72,7 +72,6 @@ const int32_t MERGE_FACTOR = 100000000; const int32_t MAX_LEAF_COUNT = 1024; const float MAXMBSortInHeap = 512.0 * 8; const int DIMS = 1; -const std::string empty_value; template class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { @@ -139,9 +138,38 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } - Status init_fulltext_index() { - bool create = true; + std::unique_ptr create_chinese_analyzer() { + auto chinese_analyzer = std::make_unique(); + chinese_analyzer->setLanguage(L"chinese"); + chinese_analyzer->initDict(config::inverted_index_dict_path); + + auto mode = get_parser_mode_string_from_properties(_index_meta->properties()); + if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) { + chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); + } else { + chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default); + } + + return chinese_analyzer; + } + + Status create_char_string_reader(std::unique_ptr& string_reader) { + CharFilterMap char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta->properties()); + if (!char_filter_map.empty()) { + string_reader = std::unique_ptr(CharFilterFactory::create( + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], + new lucene::util::SStringReader(), + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); + } else { + string_reader = std::make_unique>(); + } + return Status::OK(); + } + Status create_index_directory(std::unique_ptr& dir) { + bool create = true; auto index_path = InvertedIndexDescriptor::get_temporary_index_path( _directory + "/" + _segment_file_name, _index_meta->index_id(), _index_meta->get_index_suffix()); @@ -149,8 +177,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { bool exists = false; auto st = _fs->exists(index_path.c_str(), &exists); if (!st.ok()) { - LOG(ERROR) << "index_path:" - << " exists error:" << st; + LOG(ERROR) << "index_path: exists error:" << st; return st; } if (exists) { @@ -158,69 +185,76 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::InternalError("init_fulltext_index directory already exists"); } - _char_string_reader = std::make_unique>(); - CharFilterMap char_filter_map = - get_parser_char_filter_map_from_properties(_index_meta->properties()); - if (!char_filter_map.empty()) { - _char_string_reader.reset(CharFilterFactory::create( - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], - _char_string_reader.release(), - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], - char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); - } + dir = std::unique_ptr( + DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), create)); + return Status::OK(); + } - _doc = std::make_unique(); - _dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true)); - - if (_parser_type == InvertedIndexParserType::PARSER_STANDARD || - _parser_type == InvertedIndexParserType::PARSER_UNICODE) { - _analyzer = std::make_unique(); - } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) { - _analyzer = std::make_unique>(); - } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) { - auto chinese_analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer(); - chinese_analyzer->setLanguage(L"chinese"); - chinese_analyzer->initDict(config::inverted_index_dict_path); - auto mode = get_parser_mode_string_from_properties(_index_meta->properties()); - if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) { - chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); - } else { - chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default); - } - _analyzer.reset(chinese_analyzer); - } else { - // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer - _analyzer = std::make_unique>(); + Status create_index_writer(std::unique_ptr& index_writer) { + bool create_index = true; + bool close_dir_on_shutdown = true; + index_writer = std::make_unique( + _dir.get(), _analyzer.get(), create_index, close_dir_on_shutdown); + index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS); + index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size); + index_writer->setMaxFieldLength(MAX_FIELD_LEN); + index_writer->setMergeFactor(MERGE_FACTOR); + index_writer->setUseCompoundFile(false); + + return Status::OK(); + } + + Status create_field(lucene::document::Field** field) { + int field_config = int(lucene::document::Field::STORE_NO) | + int(lucene::document::Field::INDEX_NONORMS); + field_config |= (_parser_type == InvertedIndexParserType::PARSER_NONE) + ? int(lucene::document::Field::INDEX_UNTOKENIZED) + : int(lucene::document::Field::INDEX_TOKENIZED); + *field = new lucene::document::Field(_field_name.c_str(), field_config); + (*field)->setOmitTermFreqAndPositions( + get_parser_phrase_support_string_from_properties(_index_meta->properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES + ? false + : true); + return Status::OK(); + } + + Status create_analyzer(std::unique_ptr& analyzer) { + switch (_parser_type) { + case InvertedIndexParserType::PARSER_STANDARD: + case InvertedIndexParserType::PARSER_UNICODE: + analyzer = std::make_unique(); + break; + case InvertedIndexParserType::PARSER_ENGLISH: + analyzer = std::make_unique>(); + break; + case InvertedIndexParserType::PARSER_CHINESE: + analyzer = create_chinese_analyzer(); + break; + default: + analyzer = std::make_unique>(); + break; } + setup_analyzer_lowercase(analyzer); + return Status::OK(); + } + + void setup_analyzer_lowercase(std::unique_ptr& analyzer) { auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); if (lowercase == "true") { - _analyzer->set_lowercase(true); + analyzer->set_lowercase(true); } else if (lowercase == "false") { - _analyzer->set_lowercase(false); + analyzer->set_lowercase(false); } - _index_writer = std::make_unique(_dir.get(), _analyzer.get(), - create, true); - _index_writer->setMaxBufferedDocs(MAX_BUFFER_DOCS); - _index_writer->setRAMBufferSizeMB(config::inverted_index_ram_buffer_size); - _index_writer->setMaxFieldLength(MAX_FIELD_LEN); - _index_writer->setMergeFactor(MERGE_FACTOR); - _index_writer->setUseCompoundFile(false); - _doc->clear(); + } - int field_config = int(lucene::document::Field::STORE_NO) | - int(lucene::document::Field::INDEX_NONORMS); - if (_parser_type == InvertedIndexParserType::PARSER_NONE) { - field_config |= int(lucene::document::Field::INDEX_UNTOKENIZED); - } else { - field_config |= int(lucene::document::Field::INDEX_TOKENIZED); - } - _field = new lucene::document::Field(_field_name.c_str(), field_config); - if (get_parser_phrase_support_string_from_properties(_index_meta->properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES) { - _field->setOmitTermFreqAndPositions(false); - } else { - _field->setOmitTermFreqAndPositions(true); - } + Status init_fulltext_index() { + RETURN_IF_ERROR(create_index_directory(_dir)); + RETURN_IF_ERROR(create_char_string_reader(_char_string_reader)); + RETURN_IF_ERROR(create_analyzer(_analyzer)); + RETURN_IF_ERROR(create_index_writer(_index_writer)); + RETURN_IF_ERROR(create_field(&_field)); + _doc = std::make_unique(); _doc->add(*_field); return Status::OK(); } @@ -555,12 +589,13 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { roaring::Roaring _null_bitmap; uint64_t _reverted_index_size; - std::unique_ptr _doc {}; - lucene::document::Field* _field {}; - std::unique_ptr _index_writer {}; - std::unique_ptr _analyzer {}; - std::unique_ptr _char_string_reader {}; - std::shared_ptr _bkd_writer; + std::unique_ptr _doc = nullptr; + lucene::document::Field* _field = nullptr; + std::unique_ptr _index_writer = nullptr; + std::unique_ptr _analyzer = nullptr; + std::unique_ptr _char_string_reader = nullptr; + std::shared_ptr _bkd_writer = nullptr; + std::unique_ptr _dir = nullptr; std::string _segment_file_name; std::string _directory; io::FileSystemSPtr _fs; @@ -568,7 +603,6 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { const TabletIndex* _index_meta; InvertedIndexParserType _parser_type; std::wstring _field_name; - std::unique_ptr _dir; }; Status InvertedIndexColumnWriter::create(const Field* field, @@ -576,127 +610,48 @@ Status InvertedIndexColumnWriter::create(const Field* field, const std::string& segment_file_name, const std::string& dir, const TabletIndex* index_meta, const io::FileSystemSPtr& fs) { - auto typeinfo = field->type_info(); + const auto* typeinfo = field->type_info(); FieldType type = typeinfo->type(); std::string field_name = field->name(); if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - const auto array_typeinfo = dynamic_cast(typeinfo); - typeinfo = array_typeinfo->item_type_info(); - type = typeinfo->type(); + const auto* array_typeinfo = dynamic_cast(typeinfo); + if (array_typeinfo != nullptr) { + typeinfo = array_typeinfo->item_type_info(); + type = typeinfo->type(); + } else { + return Status::NotSupported("unsupported array type for inverted index: " + + std::to_string(int(type))); + } } switch (type) { - case FieldType::OLAP_FIELD_TYPE_CHAR: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_VARCHAR: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_STRING: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DATETIME: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DATE: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DATETIMEV2: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DATEV2: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_TINYINT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_SMALLINT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); +#define M(TYPE) \ + case TYPE: \ + *res = std::make_unique>( \ + field_name, segment_file_name, dir, fs, index_meta); \ break; - } - case FieldType::OLAP_FIELD_TYPE_INT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_LARGEINT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DECIMAL: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DECIMAL32: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DECIMAL64: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DECIMAL128I: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DECIMAL256: { - *res = std::make_unique< - InvertedIndexColumnWriterImpl>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_BOOL: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_DOUBLE: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_FLOAT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } - case FieldType::OLAP_FIELD_TYPE_BIGINT: { - *res = std::make_unique>( - field_name, segment_file_name, dir, fs, index_meta); - break; - } + M(FieldType::OLAP_FIELD_TYPE_TINYINT) + M(FieldType::OLAP_FIELD_TYPE_SMALLINT) + M(FieldType::OLAP_FIELD_TYPE_INT) + M(FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT) + M(FieldType::OLAP_FIELD_TYPE_BIGINT) + M(FieldType::OLAP_FIELD_TYPE_LARGEINT) + M(FieldType::OLAP_FIELD_TYPE_CHAR) + M(FieldType::OLAP_FIELD_TYPE_VARCHAR) + M(FieldType::OLAP_FIELD_TYPE_STRING) + M(FieldType::OLAP_FIELD_TYPE_DATE) + M(FieldType::OLAP_FIELD_TYPE_DATETIME) + M(FieldType::OLAP_FIELD_TYPE_DECIMAL) + M(FieldType::OLAP_FIELD_TYPE_DATEV2) + M(FieldType::OLAP_FIELD_TYPE_DATETIMEV2) + M(FieldType::OLAP_FIELD_TYPE_DECIMAL32) + M(FieldType::OLAP_FIELD_TYPE_DECIMAL64) + M(FieldType::OLAP_FIELD_TYPE_DECIMAL128I) + M(FieldType::OLAP_FIELD_TYPE_DECIMAL256) + M(FieldType::OLAP_FIELD_TYPE_BOOL) + M(FieldType::OLAP_FIELD_TYPE_DOUBLE) + M(FieldType::OLAP_FIELD_TYPE_FLOAT) +#undef M default: return Status::NotSupported("unsupported type for inverted index: " + std::to_string(int(type)));