diff --git a/be/src/http/action/restore_tablet_action.cpp b/be/src/http/action/restore_tablet_action.cpp
index 745fa106bcb7d8..6d0e1a740c9544 100644
--- a/be/src/http/action/restore_tablet_action.cpp
+++ b/be/src/http/action/restore_tablet_action.cpp
@@ -37,6 +37,7 @@
 #include "olap/storage_engine.h"
 #include "olap/data_dir.h"
 #include "runtime/exec_env.h"
+#include "gutil/strings/substitute.h" // for Substitute
 
 using boost::filesystem::path;
 
@@ -86,11 +87,11 @@ Status RestoreTabletAction::_handle(HttpRequest *req) {
 
     TabletSharedPtr tablet =
             StorageEngine::instance()->tablet_manager()->get_tablet(tablet_id, schema_hash);
-    if (tablet.get() != nullptr) {
+    if (tablet != nullptr) {
         LOG(WARNING) << "find tablet. tablet_id=" << tablet_id << " schema_hash=" << schema_hash;
         return Status::InternalError("tablet already exists, can not restore.");
     }
-    std::string key = std::to_string(tablet_id) + "_" + std::to_string(schema_hash);
+    std::string key = tablet_id_str + "_" + schema_hash_str;
     {
         // check tablet_id + schema_hash already is restoring
         std::lock_guard<std::mutex> l(_tablet_restore_lock);
@@ -121,7 +122,7 @@ Status RestoreTabletAction::_reload_tablet(
                      << ", signature: " << tablet_id;
         // remove tablet data path in data path
-        // path: /roo_path/data/shard/tablet_id
-        std::string tablet_path = shard_path + "/" + std::to_string(tablet_id);
+        // path: /root_path/data/shard/tablet_id/schema_hash
+        std::string tablet_path = strings::Substitute("$0/$1/$2", shard_path, tablet_id, schema_hash);
         LOG(INFO) << "remove tablet_path:" << tablet_path;
         Status s = FileUtils::remove_all(tablet_path);
         if (!s.ok()) {
@@ -129,28 +130,19 @@ Status RestoreTabletAction::_reload_tablet(
         }
         return Status::InternalError("command executor load header failed");
     } else {
-        LOG(INFO) << "load header success. status: " << res
-                  << ", signature: " << tablet_id;
-        // remove tablet data path in trash
-        // path: /root_path/trash/time_label, because only one tablet path under time_label
         std::string trash_tablet_schema_hash_dir = "";
-
         {
             // get tablet path in trash
             std::lock_guard<std::mutex> l(_tablet_restore_lock);
             trash_tablet_schema_hash_dir = _tablet_path_map[key];
         }
+        LOG(INFO) << "load header success. status: " << res
+                  << ", signature: " << tablet_id << ", from trash path:" << trash_tablet_schema_hash_dir
+                  << " to shard path:" << shard_path;
 
-        boost::filesystem::path trash_tablet_schema_hash_path(trash_tablet_schema_hash_dir);
-        boost::filesystem::path time_label_path = trash_tablet_schema_hash_path.parent_path().parent_path();
-        LOG(INFO) << "remove time label path:" << time_label_path.string();
-        Status s = FileUtils::remove_all(time_label_path.string());
-        if (!s.ok()) {
-            LOG(WARNING) << "remove time label path:" << time_label_path.string() << " failed";
-        }
         return Status::OK();
     }
 }
 
 Status RestoreTabletAction::_restore(const std::string& key, int64_t tablet_id, int32_t schema_hash) {
     // get latest tablet path in trash
@@ -185,31 +177,42 @@ Status RestoreTabletAction::_restore(const std::string& key, int64_t tablet_id,
         return s;
     }
     // create hard link for files in /root_path/data/shard/tablet_id/schema_hash
-    std::vector<std::string> files;
-    s = FileUtils::scan_dir(latest_tablet_path, &files);
+    s = _create_hard_link_recursive(latest_tablet_path, restore_schema_hash_path);
     if (!s.ok()) {
-        LOG(WARNING) << "scan dir failed:" << latest_tablet_path;
+        // best-effort cleanup of the partially linked directory; the caller must
+        // still receive the original link error, not the cleanup status
+        Status clean_st = FileUtils::remove_all(restore_schema_hash_path);
+        if (!clean_st.ok()) {
+            LOG(WARNING) << "remove invalid restore path:" << restore_schema_hash_path << " failed";
+        }
         return s;
     }
+    std::string restore_shard_path = store->get_absolute_shard_path(std::to_string(tablet_meta.shard_id()));
+    Status status = _reload_tablet(key, restore_shard_path, tablet_id, schema_hash);
+    return status;
+}
+
+// Mirror the directory tree under `src` into `dst`: sub-directories are created
+// and every other entry is hard-linked. `dst` itself is not created here.
+// Returns the first error hit; entries created before the failure are left in place.
+Status RestoreTabletAction::_create_hard_link_recursive(const std::string& src, const std::string& dst) {
+    std::vector<std::string> files;
+    RETURN_IF_ERROR(FileUtils::scan_dir(src, &files));
     for (auto& file : files) {
-        std::string from = latest_tablet_path + "/" + file;
-        std::string to = restore_schema_hash_path + "/" + file;
-        int link_ret = link(from.c_str(), to.c_str());
-        if (link_ret != 0) {
-            LOG(WARNING) << "link from:" << from
-                         << " to:" << to << " failed, link ret:" << link_ret;
-            std::string restore_tablet_path = store->get_absolute_tablet_path(&tablet_meta, false);
-            LOG(WARNING) << "remove tablet_path:" << restore_tablet_path;
-            Status s = FileUtils::remove_all(restore_tablet_path);
-            if (!s.ok()) {
-                LOG(WARNING) << "remove invalid tablet path:" << restore_tablet_path << " failed";
+        std::string from = src + "/" + file;
+        std::string to = dst + "/" + file;
+        if (FileUtils::is_dir(from)) {
+            RETURN_IF_ERROR(FileUtils::create_dir(to));
+            RETURN_IF_ERROR(_create_hard_link_recursive(from, to));
+        } else {
+            int link_ret = link(from.c_str(), to.c_str());
+            if (link_ret != 0) {
+                LOG(WARNING) << "link from:" << from
+                             << " to:" << to << " failed, link ret:" << link_ret;
+                return Status::InternalError("create link path failed");
             }
-            return Status::InternalError("create link path failed");
         }
     }
-    std::string restore_shard_path = store->get_absolute_shard_path(std::to_string(tablet_meta.shard_id()));
-    Status status = _reload_tablet(key, restore_shard_path, tablet_id, schema_hash);
-    return status;
+    return Status::OK();
 }
 
 bool RestoreTabletAction::_get_latest_tablet_path_from_trash(
diff --git a/be/src/http/action/restore_tablet_action.h b/be/src/http/action/restore_tablet_action.h
index d22686d4932dc6..9fce3110cb2477 100644
--- a/be/src/http/action/restore_tablet_action.h
+++ b/be/src/http/action/restore_tablet_action.h
@@ -51,6 +51,8 @@ class RestoreTabletAction : public HttpHandler {
 
     void _clear_key(const std::string& key);
 
+    Status _create_hard_link_recursive(const std::string& src, const std::string& dst);
+
 private:
     ExecEnv* _exec_env;
     std::mutex _tablet_restore_lock;
diff --git
a/docs/documentation/cn/administrator-guide/operation/tablet-restore-tool.md b/docs/documentation/cn/administrator-guide/operation/tablet-restore-tool.md
new file mode 100644
index 00000000000000..0aae09f423312c
--- /dev/null
+++ b/docs/documentation/cn/administrator-guide/operation/tablet-restore-tool.md
@@ -0,0 +1,64 @@
+# BE Tablet数据恢复工具
+
+## 背景
+
+用户在使用Doris的过程中,可能会因为一些误操作或者线上bug,导致一些有效的tablet被删除(包括元数据和数据)。为了防止在这些异常情况下出现数据丢失,Doris提供了回收站机制,来保护用户数据。用户删除的tablet数据不会被直接删除,会被放在回收站中存储一段时间,在一段时间之后会有定时清理机制将过期的数据删除。回收站中的数据包括:tablet的data文件(.dat),tablet的索引文件(.idx)和tablet的元数据文件(.hdr)。数据将会存放在如下格式的路径:
+
+/root_path/trash/time_label/tablet_id/schema_hash/
+
+其中,root_path:是用户配置的一块盘上BE存储的根目录;
+trash:是回收站的目录;
+time_label:时间标签,为了回收站中数据目录的唯一性,同时记录数据时间,使用时间标签作为子目录。
+
+当用户发现线上的数据被误删除,需要从回收站中恢复被删除的tablet时,可以使用这个tablet数据恢复功能。BE提供http接口和restore_tablet_tool.sh脚本实现这个功能,支持单tablet操作(single mode)和批量操作模式(batch mode)。
+在single mode下,支持单个tablet的数据恢复。
+在batch mode下,支持批量tablet的数据恢复。
+
+## 操作
+
+### single mode
+
+#### http请求方式
+
+BE中提供单个tablet数据恢复的http接口,接口如下:
+
+```
+curl -X POST "http://localhost:8040/api/restore_tablet?tablet_id=11111&schema_hash=12345"
+```
+
+
+成功的结果如下:
+```
+{"status": "Success", "msg": "OK"}
+```
+
+失败的话,会返回相应的失败原因,一种可能的结果如下:
+```
+{"status": "Failed", "msg": "create link path failed"}
+```
+
+#### 脚本方式
+
+restore_tablet_tool.sh可用来实现单tablet数据恢复的功能。
+
+```
+sh tools/restore_tablet_tool.sh -b "http://127.0.0.1:8040" -t 12345 -s 11111
+sh tools/restore_tablet_tool.sh --backend "http://127.0.0.1:8040" --tablet_id 12345 --schema_hash 11111
+```
+
+### batch mode
+
+批量恢复模式用于实现恢复多个tablet数据的功能。使用的时候需要预先将要恢复的tablet id和schema hash按照逗号分隔的格式放在一个文件中,一个tablet一行。
+格式如下:
+```
+12345,11111
+12346,11111
+12347,11111
+```
+
+然后使用如下命令进行恢复(假设文件名为:tablets.txt):
+
+```
+sh tools/restore_tablet_tool.sh -b "http://127.0.0.1:8040" -f tablets.txt
+sh tools/restore_tablet_tool.sh --backend "http://127.0.0.1:8040" --file tablets.txt
+```
diff --git a/tools/restore_tablet_tool.sh b/tools/restore_tablet_tool.sh
new
file mode 100644
index 00000000000000..17c1e2ca4febec
--- /dev/null
+++ b/tools/restore_tablet_tool.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Restore tablets from the BE trash through the BE restore_tablet HTTP API.
+# Single mode (-t/-s) restores one tablet; batch mode (-f) restores every
+# "tablet_id,schema_hash" line listed in a file.
+
+# print usage and exit with status 1
+usage() {
+  echo "
+Description:
+  This script is used to restore the tablets from trash. It supports single mode
+  and batch mode.
+  In single mode, it will restore just one tablet.
+  In batch mode, it will restore all the tablets specified in file. The content
+  of the file is comma-split tablet id and schema hash, like the following:
+  12345,11111
+  12346,11111
+  12347,11111
+
+Usage: $0 <options>
+  Optional options:
+     -h | --help            print help info
+     -b | --backend         backend http service, default: http://127.0.0.1:8040
+     -t | --tablet_id       tablet id to restore
+     -s | --schema_hash     tablet related schema hash
+     -f | --file            file with lines containing comma-split tablet id and schema hash
+
+Examples:
+  batch mode:
+    sh restore_tablet_tool.sh -b "http://127.0.0.1:8040" -f tablets.txt
+    sh restore_tablet_tool.sh --backend "http://127.0.0.1:8040" --file tablets.txt
+
+  single mode:
+    sh restore_tablet_tool.sh -b "http://127.0.0.1:8040" -t 12345 -s 11111
+    sh restore_tablet_tool.sh --backend "http://127.0.0.1:8040" --tablet_id 12345 --schema_hash 11111
+  "
+  exit 1
+}
+
+# The option strings must list every flag the case statement below handles:
+# 'h' among the short options and 'backend' among the long ones.
+OPTS=$(getopt \
+  -n "$0" \
+  -o 'hb:t:s:f:' \
+  -l 'backend:,tablet_id:,schema_hash:,file:,help' \
+  -- "$@")
+
+if [ $? != 0 ] ; then
+    usage
+fi
+
+eval set -- "$OPTS"
+
+SERVER="http://127.0.0.1:8040"
+TABLET_ID=
+SCHEMA_HASH=
+FILENAME=
+BATCH_MODE=false
+
+while true; do
+    case "$1" in
+        -b|--backend) SERVER=$2 ; shift 2 ;;
+        -f|--file) FILENAME=$2 ; BATCH_MODE=true ; shift 2 ;;
+        -t|--tablet_id) TABLET_ID=$2 ; shift 2 ;;
+        -s|--schema_hash) SCHEMA_HASH=$2 ; shift 2 ;;
+        -h|--help) usage ; shift ;;
+        --) shift ; break ;;
+        *) echo "Internal error!" ; exit 1 ;;
+    esac
+done
+
+# restore_tablet <backend> <tablet_id> <schema_hash>
+# POST one restore request to the backend and print its response.
+restore_tablet() {
+    echo "start to restore tablet id:$2, schema hash:$3"
+    curl -X POST "$1/api/restore_tablet?tablet_id=$2&schema_hash=$3"
+    printf '\n\n'
+}
+
+if [ "$BATCH_MODE" = true ] ; then
+    if [ ! -r "$FILENAME" ] ; then
+        echo "can not read file: $FILENAME"
+        exit 1
+    fi
+    # each line: tablet_id,schema_hash (the || keeps a last line without newline)
+    while IFS=, read -r TABLET_ID SCHEMA_HASH || [ -n "$TABLET_ID" ] ; do
+        # skip blank lines
+        if [ -z "$TABLET_ID" ] ; then
+            continue
+        fi
+        restore_tablet "$SERVER" "$TABLET_ID" "$SCHEMA_HASH"
+    done < "$FILENAME"
+else
+    if [ -z "$TABLET_ID" ] || [ -z "$SCHEMA_HASH" ] ; then
+        echo "tablet_id and schema_hash are required in single mode"
+        usage
+    fi
+    restore_tablet "$SERVER" "$TABLET_ID" "$SCHEMA_HASH"
+fi