From 08a65d2903df77283844eecc0b9b9caf22be9f0e Mon Sep 17 00:00:00 2001 From: chaoyli Date: Tue, 17 Sep 2019 14:56:40 +0800 Subject: [PATCH] Check file descriptor number is larger than 65536 upon start --- be/src/common/config.h | 8 ++++-- be/src/olap/olap_define.h | 1 + be/src/olap/storage_engine.cpp | 27 +++++++++++++++++-- be/src/olap/storage_engine.h | 5 +++- .../cn/installing/install-deploy.md | 14 ++++++++++ .../en/installing/install-deploy_EN.md | 16 +++++++++++ 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index ce5ac4686de7b5..540df3a3a8b6dd 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -219,8 +219,12 @@ namespace config { CONF_Int32(trash_file_expire_time_sec, "259200"); // check row nums for BE/CE and schema change. true is open, false is closed. CONF_Bool(row_nums_check, "true") - //file descriptors cache, by default, cache 30720 descriptors - CONF_Int32(file_descriptor_cache_capacity, "30720"); + //file descriptors cache, by default, cache 32768 descriptors + CONF_Int32(file_descriptor_cache_capacity, "32768"); + // minimum/maximum file descriptor number + // modify them upon necessity + CONF_Int32(min_file_descriptor_number, "65536"); + CONF_Int32(max_file_descriptor_number, "131072"); CONF_Int64(index_stream_cache_capacity, "10737418240"); CONF_Int64(max_packed_row_block_size, "20971520"); diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index 22757b5becd9ad..3735f1e2c8f104 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -130,6 +130,7 @@ enum OLAPStatus { OLAP_ERR_EVAL_CONJUNCTS_ERROR = -120, OLAP_ERR_COPY_FILE_ERROR = -121, OLAP_ERR_FILE_ALREADY_EXIST = -122, + OLAP_ERR_TOO_FEW_FILE_DESCRITPROR = -123, // common errors codes // [-200, -300) diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index d084cc1d4aaf5f..35e2f79f09ad44 100644 --- a/be/src/olap/storage_engine.cpp +++ 
b/be/src/olap/storage_engine.cpp @@ -177,7 +177,7 @@ OLAPStatus StorageEngine::open() { _store_map.emplace(path.path, store); } _effective_cluster_id = config::cluster_id; - auto res = check_all_root_path_cluster_id(); + auto res = _check_all_root_path_cluster_id(); if (res != OLAP_SUCCESS) { LOG(WARNING) << "fail to check cluster info. res=" << res; return res; @@ -185,6 +185,14 @@ OLAPStatus StorageEngine::open() { _update_storage_medium_type_count(); + res = _check_file_descriptor_number(); + if (res != OLAP_SUCCESS) { + LOG(WARNING) << "file descriptor number is not between " + << "min_file_descriptor_number:" << config::min_file_descriptor_number + << " and max_file_descriptor_number:" << config::max_file_descriptor_number; + return OLAP_ERR_INIT_FAILED; + } + auto cache = new_lru_cache(config::file_descriptor_cache_capacity); if (cache == nullptr) { LOG(WARNING) << "failed to init file descriptor LRUCache"; @@ -354,7 +362,22 @@ bool StorageEngine::_used_disk_not_enough(uint32_t unused_num, uint32_t total_nu return ((total_num == 0) || (unused_num * 100 / total_num > _min_percentage_of_error_disk)); } -OLAPStatus StorageEngine::check_all_root_path_cluster_id() { +OLAPStatus StorageEngine::_check_file_descriptor_number() { + struct rlimit l; + int ret = getrlimit(RLIMIT_NOFILE , &l); + if (ret != 0) { + LOG(WARNING) << "call getrlimit() failed. 
errno=" << strerror(errno) + << ", use default configuration instead."; + return OLAP_SUCCESS; + } + if (l.rlim_cur < config::min_file_descriptor_number + || l.rlim_cur > config::max_file_descriptor_number) { + return OLAP_ERR_TOO_FEW_FILE_DESCRITPROR; + } + return OLAP_SUCCESS; +} + +OLAPStatus StorageEngine::_check_all_root_path_cluster_id() { int32_t cluster_id = -1; for (auto& it : _store_map) { int32_t tmp_cluster_id = it.second->cluster_id(); diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 0a630d9f896576..cfbd995b24615a 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -204,7 +204,10 @@ class StorageEngine { void release_rowset_id(const RowsetId& rowset_id) { return _rowset_id_generator->release_id(rowset_id); }; private: - OLAPStatus check_all_root_path_cluster_id(); + + OLAPStatus _check_file_descriptor_number(); + + OLAPStatus _check_all_root_path_cluster_id(); bool _used_disk_not_enough(uint32_t unused_num, uint32_t total_num); diff --git a/docs/documentation/cn/installing/install-deploy.md b/docs/documentation/cn/installing/install-deploy.md index c284e388750833..18e5bffabda092 100644 --- a/docs/documentation/cn/installing/install-deploy.md +++ b/docs/documentation/cn/installing/install-deploy.md @@ -378,3 +378,17 @@ Broker 是无状态的进程,可以随意启停。当然,停止后,正在 > 出现这个问题的主要原因是:当用户通过 `ADD BACKEND` 语句添加 BE 后,FE 会识别该语句中指定的是 hostname 还是 IP。如果是 hostname,则 FE 会自动将其转换为 IP 地址并存储到元数据中。当 BE 在汇报任务完成信息时,会携带自己的 IP 地址。而如果 FE 发现 BE 汇报的 IP 地址和元数据中不一致时,就会出现如上错误。 > > 这个错误的解决方法:1)分别在 FE 和 BE 设置 **priority\_network** 参数。通常 FE 和 BE 都处于一个网段,所以该参数设置为相同即可。2)在 `ADD BACKEND` 语句中直接填写 BE 正确的 IP 地址而不是 hostname,以避免 FE 获取到错误的 IP 地址。 + +5. BE 进程文件句柄数 + + BE进程文件句柄数,受min_file_descriptor_number/max_file_descriptor_number两个参数控制。 + + 如果不在[min_file_descriptor_number, max_file_descriptor_number]区间内,BE进程启动会出错,可以使用ulimit进行设置。 + + min_file_descriptor_number的默认值为65536。 + + max_file_descriptor_number的默认值为131072. 
+ + 举例而言:ulimit -n 65536; 表示将文件句柄设成65536。 + + 启动BE进程之后,可以通过 cat /proc/$pid/limits 查看进程实际生效的句柄数。 diff --git a/docs/documentation/en/installing/install-deploy_EN.md b/docs/documentation/en/installing/install-deploy_EN.md index 1ced7a9b61d77a..ead626a9666af9 100644 --- a/docs/documentation/en/installing/install-deploy_EN.md +++ b/docs/documentation/en/installing/install-deploy_EN.md @@ -389,3 +389,19 @@ Broker is a stateless process that can be started or stopped at will. Of course, > The main reason for this problem is that when the user adds BE through the `ADD BACKEND` statement, FE recognizes whether the statement specifies hostname or IP. If it is hostname, FE automatically converts it to an IP address and stores it in metadata. When BE reports on the completion of the task, it carries its own IP address. If FE finds that BE reports inconsistent IP addresses and metadata, it will make the above error. > > Solutions to this error: 1) Set **priority\_network** parameters in FE and BE respectively. Usually FE and BE are in a network segment, so this parameter can be set to the same. 2) Fill in the `ADD BACKEND` statement directly with the correct IP address of BE instead of hostname to avoid FE getting the wrong IP address. + +5. File descriptor limit of the BE process + + The file descriptor limit of the BE process is constrained by the two parameters min_file_descriptor_number/max_file_descriptor_number. + + If the limit is not within the [min_file_descriptor_number, max_file_descriptor_number] interval, an error will occur when starting the BE process. + + In this case, please use the ulimit command to set the file descriptor limit. + + The default value of min_file_descriptor_number is 65536. + + The default value of max_file_descriptor_number is 131072. + + For example: ulimit -n 65536; this command sets the file descriptor limit to 65536. + + After starting the BE process, you can use **cat /proc/$pid/limits** to see the limits actually in effect for the process.