From 4c2d6e649c8393a61d88ad3cee3a6c47f5f2a2c2 Mon Sep 17 00:00:00 2001
From: yangzhg
Date: Mon, 21 Mar 2022 15:39:43 +0800
Subject: [PATCH] [api-change] add soft limit of String type length

1. add a config string_type_length_soft_limit_bytes to soft-limit the max length of the String type
2. disallow using the String type in key columns, partition columns and distribution columns
3. remove the String type alias BLOB for future use
---
 be/src/common/config.h                        |  7 +++
 be/src/exec/tablet_sink.cpp                   | 55 +++++++++++--------
 be/src/olap/delete_handler.cpp                |  4 +-
 be/src/olap/olap_define.h                     |  3 -
 be/src/olap/row_block2.cpp                    |  9 +--
 be/src/olap/types.h                           |  4 +-
 be/src/olap/wrapper_field.cpp                 |  5 +-
 be/src/runtime/types.h                        |  3 +-
 be/src/vec/sink/vtablet_sink.cpp              | 29 +++++-----
 .../administrator-guide/config/be_config.md   |  5 ++
 .../sql-statements/Data Types/STRING.md       |  2 +-
 .../administrator-guide/config/be_config.md   |  5 ++
 .../sql-statements/Data Types/STRING.md       |  2 +-
 fe/fe-core/src/main/cup/sql_parser.cup        |  2 -
 .../org/apache/doris/analysis/ColumnDef.java  |  5 +-
 .../doris/analysis/CreateTableStmt.java       | 12 ++--
 .../doris/analysis/DistributionDesc.java      |  2 +-
 .../doris/analysis/HashDistributionDesc.java  | 18 +++++-
 .../apache/doris/analysis/PartitionDesc.java  |  5 ++
 .../analysis/RandomDistributionDesc.java      |  2 +-
 .../org/apache/doris/catalog/ScalarType.java  |  1 -
 .../org/apache/doris/planner/PlannerTest.java | 15 +++++
 22 files changed, 129 insertions(+), 66 deletions(-)

diff --git a/be/src/common/config.h b/be/src/common/config.h
index 729330505a998e..438a4a345f2141 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -720,6 +720,13 @@ CONF_String(function_service_protocol, "h2:grpc");
 // use which load balancer to select server to connect
 CONF_String(rpc_load_balancer, "rr");
 
+// A soft limit on the length of the string type. The hard limit is 2GB - 4, but overly long strings
+// cause very low performance, so we set a soft limit, default is 1MB
+CONF_mInt32(string_type_length_soft_limit_bytes, "1048576");
+
+CONF_Validator(string_type_length_soft_limit_bytes,
+               [](const int config) -> bool { return config > 0 && config <= 2147483643; });
+
 } // namespace config
 } // namespace doris
 
diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp
index 6be3e2dba10759..4719c069cae8ef 100644
--- a/be/src/exec/tablet_sink.cpp
+++ b/be/src/exec/tablet_sink.cpp
@@ -50,7 +50,8 @@ NodeChannel::NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int
     if (_parent->_transfer_data_by_brpc_attachment) {
         _tuple_data_buffer_ptr = &_tuple_data_buffer;
     }
-    _node_channel_tracker = MemTracker::create_tracker(-1, "NodeChannel" + thread_local_ctx.get()->thread_id_str());
+    _node_channel_tracker =
+            MemTracker::create_tracker(-1, "NodeChannel" + thread_local_ctx.get()->thread_id_str());
 }
 
 NodeChannel::~NodeChannel() noexcept {
@@ -193,7 +194,8 @@ Status NodeChannel::open_wait() {
             return;
         }
         // If rpc failed, mark all tablets on this node channel as failed
-        _index_channel->mark_as_failed(this->node_id(), this->host(), _add_batch_closure->cntl.ErrorText(), -1);
+        _index_channel->mark_as_failed(this->node_id(), this->host(),
+                                       _add_batch_closure->cntl.ErrorText(), -1);
         Status st = _index_channel->check_intolerable_failure();
         if (!st.ok()) {
             _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg()));
@@ -216,7 +218,8 @@ Status NodeChannel::open_wait() {
             if (status.ok()) {
                 // if has error tablet, handle them first
                 for (auto& error : result.tablet_errors()) {
-                    _index_channel->mark_as_failed(this->node_id(), this->host(),
error.msg(), error.tablet_id()); + _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), + error.tablet_id()); } Status st = _index_channel->check_intolerable_failure(); @@ -262,8 +265,7 @@ Status NodeChannel::add_row(Tuple* input_tuple, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && - _pending_batches_num > 0 && + while (!_cancelled && _pending_batches_num > 0 && (_pending_batches_bytes > _max_pending_batches_bytes || _parent->_mem_tracker->any_limit_exceeded())) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); @@ -314,8 +316,7 @@ Status NodeChannel::add_row(BlockRow& block_row, int64_t tablet_id) { // But there is still some unfinished things, we do mem limit here temporarily. // _cancelled may be set by rpc callback, and it's possible that _cancelled might be set in any of the steps below. // It's fine to do a fake add_row() and return OK, because we will check _cancelled in next add_row() or mark_close(). - while (!_cancelled && - _pending_batches_num > 0 && + while (!_cancelled && _pending_batches_num > 0 && (_pending_batches_bytes > _max_pending_batches_bytes || _parent->_mem_tracker->any_limit_exceeded())) { SCOPED_ATOMIC_TIMER(&_mem_exceeded_block_ns); @@ -393,7 +394,7 @@ Status NodeChannel::close_wait(RuntimeState* state) { while (!_add_batches_finished && !_cancelled) { SleepFor(MonoDelta::FromMilliseconds(1)); } - _close_time_ms = UnixMillis() - _close_time_ms; + _close_time_ms = UnixMillis() - _close_time_ms; if (_add_batches_finished) { { @@ -685,7 +686,8 @@ OlapTableSink::~OlapTableSink() { // OlapTableSink::_mem_tracker and its parents. // But their destructions are after OlapTableSink's. for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr& ch) { ch->clear_all_batches(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->clear_all_batches(); }); } } @@ -846,11 +848,13 @@ Status OlapTableSink::open(RuntimeState* state) { RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr& ch) { ch->open(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->open(); }); } for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&index_channel](const std::shared_ptr& ch) { + index_channel->for_each_node_channel([&index_channel]( + const std::shared_ptr& ch) { auto st = ch->open_wait(); if (!st.ok()) { // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. 
@@ -940,13 +944,13 @@ Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { uint32_t tablet_index = 0; if (findTabletMode != FindTabletMode::FIND_TABLET_EVERY_ROW) { if (_partition_to_tablet_map.find(partition->id) == _partition_to_tablet_map.end()) { - tablet_index = _partition->find_tablet(tuple,*partition); + tablet_index = _partition->find_tablet(tuple, *partition); _partition_to_tablet_map.emplace(partition->id, tablet_index); } else { tablet_index = _partition_to_tablet_map[partition->id]; } } else { - tablet_index = _partition->find_tablet(tuple,*partition); + tablet_index = _partition->find_tablet(tuple, *partition); } _partition_ids.emplace(partition->id); for (int j = 0; j < partition->indexes.size(); ++j) { @@ -984,7 +988,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { { SCOPED_TIMER(_close_timer); for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr& ch) { ch->mark_close(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr& ch) { ch->mark_close(); }); num_node_channels += index_channel->num_node_channels(); } @@ -997,7 +1002,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { &total_add_batch_num](const std::shared_ptr& ch) { auto s = ch->close_wait(state); if (!s.ok()) { - index_channel->mark_as_failed(ch->node_id(), ch->host(), s.get_error_msg(), -1); + index_channel->mark_as_failed(ch->node_id(), ch->host(), + s.get_error_msg(), -1); LOG(WARNING) << ch->channel_info() << ", close channel failed, err: " << s.get_error_msg(); @@ -1047,7 +1053,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { // print log of add batch time of all node, for tracing load performance easily std::stringstream ss; ss << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; + << ", txn_id=" << _txn_id + << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; for (auto const& pair : node_add_batch_counter_map) { ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" @@ -1056,8 +1063,9 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { LOG(INFO) << ss.str(); } else { for (auto channel : _channels) { - channel->for_each_node_channel( - [&status](const std::shared_ptr& ch) { ch->cancel(status.get_error_msg()); }); + channel->for_each_node_channel([&status](const std::shared_ptr& ch) { + ch->cancel(status.get_error_msg()); + }); } } @@ -1189,13 +1197,14 @@ Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitma } case TYPE_STRING: { StringValue* str_val = (StringValue*)slot; - if (str_val->len > OLAP_STRING_MAX_LENGTH) { + if (str_val->len > config::string_type_length_soft_limit_bytes) { fmt::format_to(error_msg, "{}", "the length of input is too long than schema. 
"); fmt::format_to(error_msg, "column_name: {}; ", desc->col_name()); fmt::format_to(error_msg, "first 128 bytes of input str: [{}] ", std::string(str_val->ptr, 128)); - fmt::format_to(error_msg, "schema length: {}; ", OLAP_STRING_MAX_LENGTH); + fmt::format_to(error_msg, "schema length: {}; ", + config::string_type_length_soft_limit_bytes); fmt::format_to(error_msg, "actual length: {}; ", str_val->len); row_valid = false; continue; @@ -1259,7 +1268,8 @@ void OlapTableSink::_send_batch_process(RuntimeState* state) { do { int running_channels_num = 0; for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&running_channels_num, this, state](const std::shared_ptr& ch) { + index_channel->for_each_node_channel([&running_channels_num, this, + state](const std::shared_ptr& ch) { running_channels_num += ch->try_send_and_fetch_status(state, this->_send_batch_thread_pool_token); }); @@ -1267,7 +1277,8 @@ void OlapTableSink::_send_batch_process(RuntimeState* state) { if (running_channels_num == 0) { LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " - "sender thread exit. " << print_id(_load_id); + "sender thread exit. " + << print_id(_load_id); return; } } while (!_stop_background_threads_latch.wait_for( diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 6dd655f07b9cfe..22340443e81f4a 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -143,7 +143,7 @@ bool DeleteConditionHandler::is_condition_value_valid(const TabletColumn& column case OLAP_FIELD_TYPE_VARCHAR: return value_str.size() <= column.length(); case OLAP_FIELD_TYPE_STRING: - return value_str.size() <= OLAP_STRING_MAX_LENGTH; + return value_str.size() <= config::string_type_length_soft_limit_bytes; case OLAP_FIELD_TYPE_DATE: case OLAP_FIELD_TYPE_DATETIME: return valid_datetime(value_str); @@ -303,7 +303,7 @@ OLAPStatus DeleteHandler::init(const TabletSchema& schema, } bool DeleteHandler::is_filter_data(const int64_t data_version, const RowCursor& row) const { - // According to semantics, the delete condition stored in _del_conds should be an OR relationship, + // According to semantics, the delete condition stored in _del_conds should be an OR relationship, // so as long as the data matches one of the _del_conds, it will return true. 
for (const auto& del_cond : _del_conds) { if (data_version <= del_cond.filter_version && diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index b16c614d062016..6af5024a339269 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -56,9 +56,6 @@ static const uint16_t OLAP_VARCHAR_MAX_LENGTH = 65535; // the max length supported for string type 2GB static const uint32_t OLAP_STRING_MAX_LENGTH = 2147483647; -// the max length supported for vec string type 1MB -static constexpr size_t MAX_SIZE_OF_VEC_STRING = 1024 * 1024; - // the max length supported for array static const uint16_t OLAP_ARRAY_MAX_LENGTH = 65535; diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index aa13c1f012e497..4f2feec37bd88c 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -17,6 +17,7 @@ #include "olap/row_block2.h" +#include #include #include @@ -190,16 +191,16 @@ Status RowBlockV2::_copy_data_to_column(int cid, } case OLAP_FIELD_TYPE_STRING: { auto column_string = assert_cast(column); - + size_t limit = config::string_type_length_soft_limit_bytes; for (uint16_t j = 0; j < _selected_size; ++j) { if (!nullable_mark_array[j]) { uint16_t row_idx = _selection_vector[j]; auto slice = reinterpret_cast(column_block(cid).cell_ptr(row_idx)); - if (LIKELY(slice->size <= MAX_SIZE_OF_VEC_STRING)) { + if (LIKELY(slice->size <= limit)) { column_string->insert_data(slice->data, slice->size); } else { - return Status::NotSupported( - "Not support string len over than 1MB in vec engine."); + return Status::NotSupported(fmt::format( + "Not support string len over than {} in vec engine.", limit)); } } else { column_string->insert_default(); diff --git a/be/src/olap/types.h b/be/src/olap/types.h index e007df90ef0613..ab55b459e57f18 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -1137,9 +1137,9 @@ template <> struct FieldTypeTraits : public FieldTypeTraits { static OLAPStatus from_string(void* buf, const std::string& scan_key) { size_t value_len = scan_key.length(); - if (value_len > OLAP_STRING_MAX_LENGTH) { + if (value_len > config::string_type_length_soft_limit_bytes) { LOG(WARNING) << "the len of value string is too long, len=" << value_len - << ", max_len=" << OLAP_STRING_MAX_LENGTH; + << ", max_len=" << config::string_type_length_soft_limit_bytes; return OLAP_ERR_INPUT_PARAMETER_ERROR; } diff --git a/be/src/olap/wrapper_field.cpp b/be/src/olap/wrapper_field.cpp index be908a27255cbe..887b93b2590aa2 100644 --- a/be/src/olap/wrapper_field.cpp +++ b/be/src/olap/wrapper_field.cpp @@ -28,8 +28,9 @@ WrapperField* WrapperField::create(const TabletColumn& column, uint32_t len) { (column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_VARCHAR || column.type() == OLAP_FIELD_TYPE_HLL || column.type() == OLAP_FIELD_TYPE_OBJECT || column.type() == OLAP_FIELD_TYPE_STRING); - size_t max_length = column.type() == OLAP_FIELD_TYPE_STRING ? OLAP_STRING_MAX_LENGTH - : OLAP_VARCHAR_MAX_LENGTH; + size_t max_length = column.type() == OLAP_FIELD_TYPE_STRING + ? 
config::string_type_length_soft_limit_bytes + : OLAP_VARCHAR_MAX_LENGTH; if (is_string_type && len > max_length) { OLAP_LOG_WARNING("length of string parameter is too long[len=%lu, max_len=%lu].", len, max_length); diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index 152e87e04817d4..da463baaf9a59c 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -46,7 +46,6 @@ struct TypeDescriptor { /// Only set if type == TYPE_CHAR or type == TYPE_VARCHAR int len; static const int MAX_VARCHAR_LENGTH = OLAP_VARCHAR_MAX_LENGTH; - static const int MAX_STRING_LENGTH = OLAP_STRING_MAX_LENGTH; static const int MAX_CHAR_LENGTH = 255; static const int MAX_CHAR_INLINE_LENGTH = 128; @@ -100,7 +99,7 @@ struct TypeDescriptor { static TypeDescriptor create_string_type() { TypeDescriptor ret; ret.type = TYPE_STRING; - ret.len = MAX_STRING_LENGTH; + ret.len = config::string_type_length_soft_limit_bytes; return ret; } diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index d1a62f4ae71dab..e8d3cc0f5f79ff 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -116,10 +116,11 @@ Status VOlapTableSink::send(RuntimeState* state, vectorized::Block* input_block) RETURN_IF_ERROR(state->append_error_msg_to_file( []() -> std::string { return ""; }, [&]() -> std::string { - fmt::memory_buffer buf; - fmt::format_to(buf, "no partition for this tuple. tuple=[]"); - return fmt::to_string(buf); - }, &stop_processing)); + fmt::memory_buffer buf; + fmt::format_to(buf, "no partition for this tuple. tuple=[]"); + return fmt::to_string(buf); + }, + &stop_processing)); _number_filtered_rows++; if (stop_processing) { return Status::EndOfFile("Encountered unqualified data, stop processing"); @@ -163,9 +164,11 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl const auto num_rows = block->rows(); fmt::memory_buffer error_msg; auto set_invalid_and_append_error_msg = [&](int row) { - filter_bitmap->Set(row, true); - return state->append_error_msg_to_file([]() -> std::string { return ""; }, - [&error_msg]() -> std::string { return fmt::to_string(error_msg); }, stop_processing); + filter_bitmap->Set(row, true); + return state->append_error_msg_to_file( + []() -> std::string { return ""; }, + [&error_msg]() -> std::string { return fmt::to_string(error_msg); }, + stop_processing); }; for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { @@ -200,12 +203,8 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl const auto column_string = assert_cast(real_column_ptr.get()); - size_t limit = MAX_SIZE_OF_VEC_STRING; - if (desc->type().type != TYPE_STRING) { - DCHECK(desc->type().len >= 0); - limit = std::min(limit, (size_t)desc->type().len); - } - + size_t limit = + std::min(config::string_type_length_soft_limit_bytes, desc->type().len); for (int j = 0; j < num_rows; ++j) { if (!filter_bitmap->Get(j)) { auto str_val = column_string->get_data_at(j); @@ -219,14 +218,14 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl fmt::format_to(error_msg, "input str: [{}] ", str_val.to_prefix(10)); fmt::format_to(error_msg, "schema length: {}; ", desc->type().len); fmt::format_to(error_msg, "actual length: {}; ", str_val.size); - } else if (str_val.size > MAX_SIZE_OF_VEC_STRING) { + } else if (str_val.size > limit) { fmt::format_to( error_msg, "{}", "the length of input string is too long than vec schema. 
"); fmt::format_to(error_msg, "column_name: {}; ", desc->col_name()); fmt::format_to(error_msg, "input str: [{}] ", str_val.to_prefix(10)); fmt::format_to(error_msg, "schema length: {}; ", desc->type().len); - fmt::format_to(error_msg, "limit length: {}; ", MAX_SIZE_OF_VEC_STRING); + fmt::format_to(error_msg, "limit length: {}; ", limit); fmt::format_to(error_msg, "actual length: {}; ", str_val.size); } diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 4707ebfc51da13..c97416ccc2d576 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1519,3 +1519,8 @@ Translated with www.DeepL.com/Translator (free version) * Type: int32 * Description: The maximum amount of data read by each OlapScanner. * Default: 1024 + +### `string_type_length_soft_limit_bytes` +* Type: int32 +* Description: A soft limit of string type length. +* Description: 1048576 \ No newline at end of file diff --git a/docs/en/sql-reference/sql-statements/Data Types/STRING.md b/docs/en/sql-reference/sql-statements/Data Types/STRING.md index 654a7b9d3e4ff7..9a019395bdc79d 100644 --- a/docs/en/sql-reference/sql-statements/Data Types/STRING.md +++ b/docs/en/sql-reference/sql-statements/Data Types/STRING.md @@ -27,7 +27,7 @@ under the License. # STRING ## Description STRING (M) -A variable length string, max legnth is 2147483643(2GB - 4). +A variable length string, max legnth is 2147483643(2GB - 4). The length of the String type is also limited by the configuration `string_type_length_soft_limit_bytes` of be, the actual maximum length that can be stored take the minimum value of both, the String type can only be used in the value column, not in the key column and the partition and bucket columns Note: Variable length strings are stored in UTF-8 encoding, so usually English characters occupies 1 byte, and Chinese characters occupies 3 bytes. diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index dfb73ff0a4b17b..3294b363948614 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1536,3 +1536,8 @@ webserver默认工作线程数 * 类型: int32 * 描述: 每个OlapScanner 读取的最大数据量 * 默认值: 1024 + +### `string_type_length_soft_limit_bytes` +* 类型: int32 +* 描述: String 类型最大长度的软限,单位是字节 +* 默认值: 1048576 diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md index 7b80e79554f2d0..88fb608fca8cbb 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md @@ -27,7 +27,7 @@ under the License. 
 # STRING
 ## description
     STRING
-    变长字符串,最大支持2147483643 字节(2GB-4)。用法类似VARCHAR。
+    变长字符串,最大支持2147483643 字节(2GB-4)。String类型的长度还受 be 配置 `string_type_length_soft_limit_bytes`, 实际能存储的最大长度 取两者最小值,String类型只能用在value 列,不能用在 key 列和分区 分桶列
 
 注意:变长字符串是以UTF-8编码存储的,因此通常英文字符占1个字节,中文字符占3个字节。
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup
index 1fd8904569a5df..38bd07b2d8c557 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -4570,8 +4570,6 @@ type ::=
     {: RESULT = ScalarType.createStringType(); :}
     | KW_TEXT
     {: RESULT = ScalarType.createStringType(); :}
-    | KW_BLOB
-    {: RESULT = ScalarType.createStringType(); :}
     | KW_VARCHAR LPAREN INTEGER_LITERAL:len RPAREN
     {: ScalarType type = ScalarType.createVarcharType(len.intValue());
        type.setAssignedStrLenInColDefinition();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java
index bd32419fe18e38..cf61e09218f66d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java
@@ -215,7 +215,10 @@ public void analyze(boolean isOlap) throws AnalysisException {
                 throw new AnalysisException("Array type column default value only support null");
             }
         }
-
+        if (isKey() && type.getPrimitiveType() == PrimitiveType.STRING) {
+            throw new AnalysisException("String Type should not be used in key column[" + getName()
+                    + "].");
+        }
         if (type.getPrimitiveType() == PrimitiveType.MAP) {
             if (defaultValue.isSet && defaultValue != DefaultValue.NULL_DEFAULT_VALUE) {
                 throw new AnalysisException("Map type column default value just support null");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
index 16e0609c40a0da..54afeb1a7df5dd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java
@@ -295,7 +295,8 @@ public void analyze(Analyzer analyzer) throws UserException {
             }
             if (hasAggregate) {
                 for (ColumnDef columnDef : columnDefs) {
-                    if (columnDef.getAggregateType() == null) {
+                    if (columnDef.getAggregateType() == null
+                            && !columnDef.getType().isScalarType(PrimitiveType.STRING)) {
                         keysColumnNames.add(columnDef.getName());
                     }
                 }
@@ -314,6 +315,9 @@ public void analyze(Analyzer analyzer) throws UserException {
                     if (columnDef.getType().isFloatingPointType()) {
                         break;
                     }
+                    if (columnDef.getType().getPrimitiveType() == PrimitiveType.STRING) {
+                        break;
+                    }
                     if (columnDef.getType().getPrimitiveType() == PrimitiveType.VARCHAR) {
                         keysColumnNames.add(columnDef.getName());
                         break;
@@ -323,8 +327,8 @@
                 // The OLAP table must has at least one short key and the float and double should not be short key.
                 // So the float and double could not be the first column in OLAP table.
if (keysColumnNames.isEmpty()) { - throw new AnalysisException("The olap table first column could not be float or double," - + " use decimal instead."); + throw new AnalysisException("The olap table first column could not be float, double, string" + + " use decimal or varchar instead."); } keysDesc = new KeysDesc(KeysType.DUP_KEYS, keysColumnNames); } @@ -416,7 +420,7 @@ public void analyze(Analyzer analyzer) throws UserException { if (distributionDesc == null) { throw new AnalysisException("Create olap table should contain distribution desc"); } - distributionDesc.analyze(columnSet); + distributionDesc.analyze(columnSet, columnDefs); } else if (engineName.equalsIgnoreCase("elasticsearch")) { EsUtil.analyzePartitionAndDistributionDesc(partitionDesc, distributionDesc); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java index c3491859f041b0..5d5b1320af3992 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java @@ -40,7 +40,7 @@ public DistributionDesc() { } - public void analyze(Set colSet) throws AnalysisException { + public void analyze(Set colSet, List columnDefs) throws AnalysisException { throw new NotImplementedException(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java index 2379d3d0c0b911..d02faa4fc6dc55 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java @@ -22,11 +22,13 @@ import org.apache.doris.catalog.DistributionInfo; import org.apache.doris.catalog.DistributionInfo.DistributionInfoType; import org.apache.doris.catalog.HashDistributionInfo; +import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.DdlException; import org.apache.doris.common.io.Text; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import java.io.DataInput; import java.io.DataOutput; @@ -58,17 +60,29 @@ public int getBuckets() { } @Override - public void analyze(Set cols) throws AnalysisException { + public void analyze(Set colSet, List columnDefs) throws AnalysisException { if (numBucket <= 0) { throw new AnalysisException("Number of hash distribution should be larger than zero."); } if (distributionColumnNames == null || distributionColumnNames.size() == 0) { throw new AnalysisException("Number of hash column should be larger than zero."); } + Set distColSet = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER); for (String columnName : distributionColumnNames) { - if (!cols.contains(columnName)) { + if (!colSet.contains(columnName)) { throw new AnalysisException("Distribution column(" + columnName + ") doesn't exist."); } + if (!distColSet.add(columnName)) { + throw new AnalysisException("Duplicated distribution column " + columnName); + } + for (ColumnDef columnDef : columnDefs) { + if (columnDef.getName().equals(columnName)) { + if (columnDef.getType().isScalarType(PrimitiveType.STRING)) { + throw new AnalysisException("String Type should not be used in distribution column[" + + columnDef.getName() + "]."); + } + } + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java index 7566cdd6f2630a..83c592c4dc9e5d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java @@ -21,6 +21,7 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.DdlException; import org.apache.doris.qe.ConnectContext; @@ -78,6 +79,10 @@ public void analyze(List columnDefs, Map otherPropert if (columnDef.getType().isFloatingPointType()) { throw new AnalysisException("Floating point type column can not be partition column"); } + if (columnDef.getType().isScalarType(PrimitiveType.STRING)) { + throw new AnalysisException("String Type should not be used in partition column[" + + columnDef.getName() + "]."); + } if (!ConnectContext.get().getSessionVariable().isAllowPartitionColumnNullable() && columnDef.isAllowNull()) { throw new AnalysisException("The partition column must be NOT NULL"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java index 947cebbab4ede3..d32af899ed22e0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java @@ -42,7 +42,7 @@ public RandomDistributionDesc(int numBucket) { } @Override - public void analyze(Set colSet) throws AnalysisException { + public void analyze(Set colSet, List columnDefs) throws AnalysisException { if (numBucket <= 0) { throw new AnalysisException("Number of random distribution should be larger than zero."); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java index 9484d7a7bd08eb..714e067699efec 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java @@ -196,7 +196,6 @@ public static ScalarType createType(String type) { return createVarcharType(); case "STRING": case "TEXT": - case "BLOB": return createStringType(); case "HLL": return createHllType(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java index ff29dc29649ffd..8548ba689a5a9d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java @@ -22,6 +22,7 @@ import org.apache.doris.analysis.ExplainOptions; import org.apache.doris.analysis.Expr; import org.apache.doris.catalog.Catalog; +import org.apache.doris.common.AnalysisException; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.utframe.UtFrameUtils; @@ -31,7 +32,9 @@ import org.junit.After; import org.junit.Assert; import org.junit.BeforeClass; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import java.util.List; @@ -41,6 +44,9 @@ public class PlannerTest { private static String runningDir = "fe/mocked/DemoTest/" + UUID.randomUUID().toString() + "/"; private static ConnectContext ctx; + @Rule + public ExpectedException 
expectedEx = ExpectedException.none(); + @After public void tearDown() throws Exception { FileUtils.deleteDirectory(new File(runningDir)); @@ -434,4 +440,13 @@ public void testBigintSlotRefCompareDecimalLiteral() { compare.accept("select * from db1.tbl2 where k1 > 2.1", "select * from db1.tbl2 where k1 > 2"); } + @Test + public void testStringType() throws Exception { + String createTbl1 = "create table db1.tbl1(k1 string, k2 varchar(32), k3 varchar(32), k4 int) " + + "AGGREGATE KEY(k1, k2,k3,k4) distributed by hash(k1) buckets 3 properties('replication_num' = '1')"; + expectedEx.expect(AnalysisException.class); + expectedEx.expectMessage("String Type should not be used in key column[k1]."); + UtFrameUtils.parseAndAnalyzeStmt(createTbl1, ctx); + } + }
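
For context, a minimal DDL sketch (hypothetical table names, not part of the patch) of how the new checks surface to users. The second statement follows the same pattern as testStringType above and is rejected with the error text added in ColumnDef.java; analogous messages are added for partition and distribution columns in PartitionDesc.java and HashDistributionDesc.java.

-- still allowed: STRING as a value column of a duplicate key table
CREATE TABLE db1.tbl_str_value (k1 INT, v1 STRING)
DUPLICATE KEY(k1)
DISTRIBUTED BY HASH(k1) BUCKETS 3
PROPERTIES('replication_num' = '1');

-- rejected after this patch: "String Type should not be used in key column[k1]."
CREATE TABLE db1.tbl_str_key (k1 STRING, k2 VARCHAR(32), k3 VARCHAR(32), k4 INT)
AGGREGATE KEY(k1, k2, k3, k4)
DISTRIBUTED BY HASH(k1) BUCKETS 3
PROPERTIES('replication_num' = '1');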