diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 41453a33d1c9a1..1e8d5cca872c7d 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -69,29 +69,14 @@ DataDir::DataDir(const std::string& path, int64_t capacity_bytes, _cluster_id(-1), _to_be_deleted(false), _current_shard(0), - _test_file_read_buf(nullptr), - _test_file_write_buf(nullptr), _meta(nullptr) {} DataDir::~DataDir() { - free(_test_file_read_buf); - free(_test_file_write_buf); delete _id_generator; delete _meta; } Status DataDir::init() { - _rand_seed = static_cast(time(NULL)); - if (posix_memalign((void**)&_test_file_write_buf, DIRECT_IO_ALIGNMENT, TEST_FILE_BUF_SIZE) != - 0) { - LOG(WARNING) << "fail to allocate memory. size=" << TEST_FILE_BUF_SIZE; - return Status::InternalError("No memory"); - } - if (posix_memalign((void**)&_test_file_read_buf, DIRECT_IO_ALIGNMENT, TEST_FILE_BUF_SIZE) != - 0) { - LOG(WARNING) << "fail to allocate memory. size=" << TEST_FILE_BUF_SIZE; - return Status::InternalError("No memory"); - } if (!FileUtils::check_exist(_path)) { LOG(WARNING) << "opendir failed, path=" << _path; return Status::InternalError("opendir failed"); @@ -315,68 +300,7 @@ void DataDir::health_check() { OLAPStatus DataDir::_read_and_write_test_file() { std::string test_file = _path + kTestFilePath; - - if (access(test_file.c_str(), F_OK) == 0) { - if (remove(test_file.c_str()) != 0) { - char errmsg[64]; - LOG(WARNING) << "fail to delete test file. " - << "path=" << test_file << ", errno=" << errno - << ", err=" << strerror_r(errno, errmsg, 64); - return OLAP_ERR_IO_ERROR; - } - } else { - if (errno != ENOENT) { - char errmsg[64]; - LOG(WARNING) << "fail to access test file. 
" - << "path=" << test_file << ", errno=" << errno - << ", err=" << strerror_r(errno, errmsg, 64); - return OLAP_ERR_IO_ERROR; - } - } - - OLAPStatus res = OLAP_SUCCESS; - FileHandler file_handler; - if ((res = file_handler.open_with_mode(test_file.c_str(), O_RDWR | O_CREAT | O_DIRECT, - S_IRUSR | S_IWUSR)) != OLAP_SUCCESS) { - LOG(WARNING) << "fail to create test file. path=" << test_file; - return res; - } - - for (size_t i = 0; i < TEST_FILE_BUF_SIZE; ++i) { - int32_t tmp_value = rand_r(&_rand_seed); - _test_file_write_buf[i] = static_cast(tmp_value); - } - - if ((res = file_handler.pwrite(_test_file_write_buf, TEST_FILE_BUF_SIZE, SEEK_SET)) != - OLAP_SUCCESS) { - LOG(WARNING) << "fail to write test file. [file_name=" << test_file << "]"; - return res; - } - - if ((res = file_handler.pread(_test_file_read_buf, TEST_FILE_BUF_SIZE, SEEK_SET)) != - OLAP_SUCCESS) { - LOG(WARNING) << "fail to read test file. [file_name=" << test_file << "]"; - return res; - } - - if (memcmp(_test_file_write_buf, _test_file_read_buf, TEST_FILE_BUF_SIZE) != 0) { - OLAP_LOG_WARNING("the test file write_buf and read_buf not equal."); - return OLAP_ERR_TEST_FILE_ERROR; - } - - if ((res = file_handler.close()) != OLAP_SUCCESS) { - LOG(WARNING) << "fail to close test file. [file_name=" << test_file << "]"; - return res; - } - - if (remove(test_file.c_str()) != 0) { - char errmsg[64]; - VLOG(3) << "fail to delete test file. 
[err='" << strerror_r(errno, errmsg, 64) << "' path='" - << test_file << "']"; - return OLAP_ERR_IO_ERROR; - } - - return res; + return read_write_test_file(test_file); } OLAPStatus DataDir::get_shard(uint64_t* shard) { diff --git a/be/src/olap/data_dir.h b/be/src/olap/data_dir.h index c9d26062ab682c..7456c170da8917 100644 --- a/be/src/olap/data_dir.h +++ b/be/src/olap/data_dir.h @@ -159,8 +159,6 @@ class DataDir { TStorageMedium::type _storage_medium; bool _is_used; - uint32_t _rand_seed; - std::string _file_system; TabletManager* _tablet_manager; TxnManager* _txn_manager; @@ -173,11 +171,7 @@ class DataDir { uint64_t _current_shard; std::set _tablet_set; - static const size_t TEST_FILE_BUF_SIZE = 4096; - static const size_t DIRECT_IO_ALIGNMENT = 512; static const uint32_t MAX_SHARD_NUM = 1024; - char* _test_file_read_buf; - char* _test_file_write_buf; OlapMeta* _meta = nullptr; RowsetIdGenerator* _id_generator = nullptr; diff --git a/be/src/olap/utils.cpp b/be/src/olap/utils.cpp index 1bcd9666a33bf8..93a61e3e77402a 100644 --- a/be/src/olap/utils.cpp +++ b/be/src/olap/utils.cpp @@ -22,7 +22,11 @@ #include #include +#include +#include #include +#include +#include #include #include @@ -30,6 +34,7 @@ #include #include #include "util/file_utils.h" +#include "olap/file_helper.h" #ifdef DORIS_WITH_LZO #include @@ -50,6 +55,7 @@ using std::string; using std::set; using std::vector; +using std::unique_ptr; namespace doris { @@ -973,6 +979,92 @@ OLAPStatus copy_file(const string& src, const string& dest) { return res; } +OLAPStatus read_write_test_file(const string& test_file_path) { + if (access(test_file_path.c_str(), F_OK) == 0) { + if (remove(test_file_path.c_str()) != 0) { + char errmsg[64]; + LOG(WARNING) << "fail to delete test file. " + << "path=" << test_file_path + << ", errno=" << errno << ", err=" << strerror_r(errno, errmsg, 64); + return OLAP_ERR_IO_ERROR; + } + } else { + if (errno != ENOENT) { + char errmsg[64]; + LOG(WARNING) << "fail to access test file. 
" + << "path=" << test_file_path + << ", errno=" << errno << ", err=" << strerror_r(errno, errmsg, 64); + return OLAP_ERR_IO_ERROR; + } + } + OLAPStatus res = OLAP_SUCCESS; + FileHandler file_handler; + if ((res = file_handler.open_with_mode(test_file_path.c_str(), + O_RDWR | O_CREAT | O_DIRECT, + S_IRUSR | S_IWUSR)) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to create test file. path=" << test_file_path; + return res; + } + const size_t TEST_FILE_BUF_SIZE = 4096; + const size_t DIRECT_IO_ALIGNMENT = 512; + char *write_test_buff = nullptr; + char *read_test_buff = nullptr; + if (posix_memalign((void**) &write_test_buff, DIRECT_IO_ALIGNMENT, TEST_FILE_BUF_SIZE)!= 0) { + LOG(WARNING) << "fail to allocate write buffer memory. size=" << TEST_FILE_BUF_SIZE; + return OLAP_ERR_MALLOC_ERROR; + } + unique_ptr write_buff (write_test_buff, &std::free); + if (posix_memalign((void**) &read_test_buff, DIRECT_IO_ALIGNMENT, TEST_FILE_BUF_SIZE)!= 0) { + LOG(WARNING) << "fail to allocate read buffer memory. size=" << TEST_FILE_BUF_SIZE; + return OLAP_ERR_MALLOC_ERROR; + } + unique_ptr read_buff (read_test_buff, &std::free); + // generate random numbers + uint32_t rand_seed = static_cast(time(NULL)); + for (size_t i = 0; i < TEST_FILE_BUF_SIZE; ++i) { + int32_t tmp_value = rand_r(&rand_seed); + write_test_buff[i] = static_cast(tmp_value); + } + if ((res = file_handler.pwrite(write_buff.get(), TEST_FILE_BUF_SIZE, SEEK_SET)) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to write test file. [file_name=" << test_file_path << "]"; + return res; + } + if ((res = file_handler.pread(read_buff.get(), TEST_FILE_BUF_SIZE, SEEK_SET)) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to read test file. 
[file_name=" << test_file_path << "]"; + return res; + } + if (memcmp(write_buff.get(), read_buff.get(), TEST_FILE_BUF_SIZE) != 0) { + LOG(WARNING) << "the test file write_buf and read_buf not equal, [file_name = " << test_file_path << "]"; + return OLAP_ERR_TEST_FILE_ERROR; + } + if ((res = file_handler.close()) != OLAP_SUCCESS) { + LOG(WARNING) << "fail to close test file. [file_name=" << test_file_path << "]"; + return res; + } + if (remove(test_file_path.c_str()) != 0) { + char errmsg[64]; + VLOG(3) << "fail to delete test file. [err='" << strerror_r(errno, errmsg, 64) + << "' path='" << test_file_path << "']"; + return OLAP_ERR_IO_ERROR; + } + return res; +} + +bool check_datapath_rw(const string& path) { + if (!FileUtils::check_exist(path)) + return false; + string file_path = path + "/.read_write_test_file"; + try { + OLAPStatus res = read_write_test_file(file_path); + return res == OLAP_SUCCESS; + } catch (...) { + // do nothing + } + LOG(WARNING) << "error when try to read and write temp file under the data path and return false. 
[path=" << path << "]"; + return false; +} + + OLAPStatus copy_dir(const string &src_dir, const string &dst_dir) { boost::filesystem::path src_path(src_dir.c_str()); diff --git a/be/src/olap/utils.h b/be/src/olap/utils.h index e610f7ce3da99c..09cbf8c3f65362 100644 --- a/be/src/olap/utils.h +++ b/be/src/olap/utils.h @@ -227,6 +227,12 @@ OLAPStatus copy_file(const std::string& src, const std::string& dest); OLAPStatus copy_dir(const std::string &src_dir, const std::string &dst_dir); +// Returns true only if `path` exists and a scratch file under it passes read_write_test_file(). +bool check_datapath_rw(const std::string& path); + +// Creates `test_file_path` with O_DIRECT, writes 4096 random bytes, reads them back and compares, then deletes the file. +OLAPStatus read_write_test_file(const std::string& test_file_path); + //转换两个list template void static_cast_assign_vector(std::vector* v1, const std::vector& v2) { diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 7f8a23834147bf..5fdb29e07f7ba7 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -143,6 +143,25 @@ int main(int argc, char** argv) { LOG(FATAL) << "parse config storage path failed, path=" << doris::config::storage_root_path; exit(-1); } + auto it = paths.begin(); + for (;it != paths.end();) { + if (!doris::check_datapath_rw(it->path)) { + if (doris::config::ignore_broken_disk) { + LOG(WARNING) << "read write test file failed, path=" << it->path; + it = paths.erase(it); + } else { + LOG(FATAL) << "read write test file failed, path=" << it->path; + exit(-1); + } + } else { + ++it; + } + } + + if (paths.empty()) { + LOG(FATAL) << "All disks are broken, exit."; + exit(-1); + } // initilize libcurl here to avoid concurrent initialization auto curl_ret = curl_global_init(CURL_GLOBAL_ALL); diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index 36d45ad56fef73..97a81b2ff57b45 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -188,6 +188,19 @@ Since this is a brpc configuration, users can also modify this parameter directl ### `ignore_broken_disk` 
### `ignore_load_tablet_failure` +When BE starts, it checks every path listed in the `storage_root_path` configuration item. + +- `ignore_broken_disk=true` + + If a path does not exist, or files under it cannot be read or written (broken disk), the path is ignored. Startup continues as long as at least one other path is usable. + +- `ignore_broken_disk=false` + + If a path does not exist, or files under it cannot be read or written (broken disk), BE fails to start and exits. + +The default value is `false`. + +### inc_rowset_expired_sec * Type: boolean * Description: Whether to continue to start be when load tablet from header failed. diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 63d36069e20281..ceb8bf0aeb70ef 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -185,6 +185,18 @@ under the License. ### `ignore_broken_disk` +当BE启动时,会检查 `storage_root_path` 配置下的所有路径。 + + - `ignore_broken_disk=true` + + 如果路径不存在或路径下无法进行读写文件(坏盘),将忽略此路径,如果有其他可用路径则不中断启动。 + + - `ignore_broken_disk=false` + + 如果路径不存在或路径下无法进行读写文件(坏盘),启动将失败并退出。 + +默认值为 `false`。 + ### `inc_rowset_expired_sec` ### `index_stream_cache_capacity`