From 55665f78081a46deeabec396f20552b4781bd990 Mon Sep 17 00:00:00 2001 From: libo Date: Sun, 1 Mar 2026 23:58:20 +0800 Subject: [PATCH 01/13] Come true all rpc interfaces in the DataNode, and partial features in the DataPartitionTableIntegrityCheckProcedure. --- DataPartitionTableIntegrityCheck_README.md | 245 ++++++++ .../AsyncDataNodeHeartbeatClientPool.java | 8 + .../client/sync/CnToDnSyncRequestType.java | 5 + .../client/sync/SyncDataNodeClientPool.java | 10 + .../confignode/manager/ProcedureManager.java | 13 + .../load/service/HeartbeatService.java | 3 + .../partition/ConfigNodeProcedureEnv.java | 39 ++ ...PartitionTableIntegrityCheckProcedure.java | 578 ++++++++++++++++++ ...tionTableIntegrityCheckProcedureState.java | 18 + .../procedure/store/ProcedureFactory.java | 6 + .../procedure/store/ProcedureType.java | 5 +- .../iotdb/confignode/service/ConfigNode.java | 9 +- .../org/apache/iotdb/db/conf/IoTDBConfig.java | 20 + .../apache/iotdb/db/conf/IoTDBDescriptor.java | 5 + .../DataPartitionTableGenerator.java | 357 +++++++++++ .../db/protocol/thrift/OperationType.java | 5 +- .../impl/DataNodeInternalRPCServiceImpl.java | 296 ++++++++- .../dataregion/tsfile/TsFileResource.java | 5 + .../tsfile/timeindex/FileTimeIndex.java | 36 ++ .../tsfile/timeindex/ITimeIndex.java | 9 +- .../conf/iotdb-system.properties.template | 15 + .../iotdb/commons/concurrent/ThreadName.java | 2 + .../DataPartitionTableGeneratorState.java | 33 + .../commons/utils/TimePartitionUtils.java | 33 + .../rateLimiter/LeakyBucketRateLimiter.java | 128 ++++ .../src/main/thrift/datanode.thrift | 54 ++ 26 files changed, 1930 insertions(+), 7 deletions(-) create mode 100644 DataPartitionTableIntegrityCheck_README.md create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java diff --git a/DataPartitionTableIntegrityCheck_README.md b/DataPartitionTableIntegrityCheck_README.md new file mode 100644 index 0000000000000..7fe3eefabb041 --- /dev/null +++ b/DataPartitionTableIntegrityCheck_README.md @@ -0,0 +1,245 @@ +# IoTDB 数据分区表完整性检测功能实现 + +## 功能概述 + +本功能实现了IoTDB ConfigNode重启时的数据分区表完整性检测,能够自动发现并恢复丢失的数据分区信息。 + +## 实现架构 + +### 1. 核心组件 + +#### Procedure实现 +- **DataPartitionTableIntegrityCheckProcedure**: 主要的Procedure实现,负责整个完整性检测流程 +- **ConfigNodeProcedureEnv**: Procedure执行环境,提供ConfigManager访问 + +#### DataNode端实现 +- **DataPartitionTableGenerator**: 扫描tsfile并生成DataPartitionTable的核心组件 +- **RPC接口扩展**: 在DataNode RPC服务中添加了三个新接口 + +#### 配置和注册 +- **ProcedureType枚举扩展**: 添加了新的Procedure类型 +- **ProcedureFactory扩展**: 支持新Procedure的创建和反序列化 +- **启动监听器**: ConfigNode启动时自动触发检测 + +### 2. 执行流程 + +``` +ConfigNode重启 → 检查Leader状态 → 收集最早timeslot → 分析缺失分区 → +请求DN生成表 → 合并分区表 → 写入Raft日志 → 完成 +``` + +## 详细实现 + +### 1. Thrift接口定义 (datanode.thrift) + +新增的RPC接口: +```thrift +// 获取最早timeslot信息 +TGetEarliestTimeslotsResp getEarliestTimeslots() + +// 请求生成DataPartitionTable +TGenerateDataPartitionTableResp generateDataPartitionTable() + +// 检查生成状态 +TCheckDataPartitionTableStatusResp checkDataPartitionTableStatus() +``` + +对应的响应结构体: +```thrift +struct TGetEarliestTimeslotsResp { + 1: required common.TSStatus status + 2: optional map databaseToEarliestTimeslot +} + +struct TGenerateDataPartitionTableResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message +} + +struct TCheckDataPartitionTableStatusResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message + 4: optional binary dataPartitionTable +} +``` + +### 2. DataNode实现 + +#### DataPartitionTableGenerator +- **并行扫描**: 使用多线程并行扫描tsfile文件 +- **进度跟踪**: 提供处理进度和状态信息 +- **错误处理**: 统计失败文件并记录错误信息 +- **配置化**: 支持自定义线程数和分区配置 + +#### RPC服务实现 +在`DataNodeInternalRPCServiceImpl`中实现: +- `getEarliestTimeslots()`: 扫描数据目录获取每个数据库的最早timeslot +- `generateDataPartitionTable()`: 启动异步扫描任务 +- `checkDataPartitionTableStatus()`: 检查任务状态并返回结果 + +### 3. ConfigNode Procedure实现 + +#### 状态机设计 +```java +public enum State { + CHECK_LEADER_STATUS, // 检查Leader状态 + COLLECT_EARLIEST_TIMESLOTS, // 收集最早timeslot + ANALYZE_MISSING_PARTITIONS, // 分析缺失分区 + REQUEST_PARTITION_TABLES, // 请求生成分区表 + MERGE_PARTITION_TABLES, // 合并分区表 + WRITE_PARTITION_TABLE_TO_RAFT, // 写入Raft日志 + SUCCESS, // 成功完成 + FAILED // 执行失败 +} +``` + +#### 错误码定义 +```java +public static final int DN_ERROR_CODE_SUCCESS = 0; // 处理成功 +public static final int DN_ERROR_CODE_IN_PROGRESS = 2; // 正在执行 +public static final int DN_ERROR_CODE_FAILED = 1; // 处理失败 +public static final int DN_ERROR_CODE_UNKNOWN = -1; // DN未知状态 +``` + +#### 核心逻辑 +1. **Leader检查**: 只有Leader节点执行检测 +2. **数据收集**: 从所有DataNode收集最早timeslot信息 +3. **缺失分析**: 对比当前分区表,识别缺失的分区 +4. **异步处理**: 向DataNode发送异步扫描请求 +5. **状态轮询**: 定期检查任务状态,支持重试机制 +6. **数据合并**: 合并所有DataNode返回的分区表 +7. **Raft写入**: 通过共识层持久化最终分区表 + +### 4. 自动触发机制 + +#### 启动监听器 +```java +public class DataPartitionTableIntegrityCheckListener { + public void onStartupComplete() { + if (isLeader()) { + startIntegrityCheck(); + } + } + + public void onBecomeLeader() { + startIntegrityCheck(); + } +} +``` + +## 关键特性 + +### 1. 原子性保证 +- 每个步骤都是幂等的,支持重试 +- Procedure框架保证状态一致性 +- 失败时可以安全回滚 + +### 2. 容错机制 +- **重试策略**: 最多重试3次 +- **超时处理**: 避免无限等待 +- **部分失败**: 部分DataNode失败时继续处理 + +### 3. 性能优化 +- **并行扫描**: DataNode端使用多线程并行处理 +- **异步执行**: 避免阻塞主流程 +- **进度跟踪**: 提供实时进度信息 + +### 4. 可扩展性 +- **配置化**: 支持自定义线程数和分区配置 +- **模块化**: 各组件独立,易于扩展 +- **接口化**: 清晰的RPC接口定义 + +## 使用方式 + +### 1. 自动触发 +ConfigNode重启时自动检测并执行,无需手动干预。 + +### 2. 手动触发 +可以通过ProcedureExecutor手动提交检测Procedure: +```java +DataPartitionTableIntegrityCheckProcedure procedure = new DataPartitionTableIntegrityCheckProcedure(); +procedureExecutor.submit(procedure); +``` + +## 配置参数 + +### DataNode配置 +- `seriesSlotNum`: 系列分区槽数量 +- `seriesPartitionExecutorClass`: 分区执行器类名 +- `dataDirs`: 数据目录配置 + +### Procedure配置 +- `MAX_RETRY_COUNT`: 最大重试次数 (默认3) +- 重试间隔: 5秒 + +## 监控和日志 + +### 日志级别 +- **INFO**: 关键流程节点信息 +- **DEBUG**: 详细的执行过程 +- **ERROR**: 错误和异常信息 + +### 关键指标 +- 处理文件数量 +- 失败文件数量 +- 执行时间 +- 重试次数 +- DataNode响应状态 + +## 注意事项 + +### 1. 依赖关系 +- 需要ConfigNode为Leader状态 +- 依赖DataNode正常注册和通信 +- 需要共识层正常工作 + +### 2. 资源消耗 +- DataNode扫描会消耗CPU和I/O资源 +- 建议在低峰期执行 +- 大数据集时需要考虑内存使用 + +### 3. 网络带宽 +- DataPartitionTable序列化后可能较大 +- 需要考虑网络传输限制 +- 建议实现增量传输机制 + +## 后续优化建议 + +### 1. 增量扫描 +- 支持增量扫描,只处理新增文件 +- 维护扫描状态,避免重复工作 + +### 2. 分布式协调 +- 实现更智能的负载分配 +- 支持动态调整扫描策略 + +### 3. 缓存优化 +- 缓存扫描结果,避免重复计算 +- 实现智能失效机制 + +### 4. 监控增强 +- 添加更详细的性能指标 +- 实现告警机制 + +## 测试验证 + +### 1. 单元测试 +- 各组件独立测试 +- 边界条件测试 +- 异常场景验证 + +### 2. 集成测试 +- 端到端流程测试 +- 多节点环境验证 +- 故障恢复测试 + +### 3. 性能测试 +- 大数据集扫描测试 +- 并发性能测试 +- 资源使用监控 + +--- + +本实现提供了完整的IoTDB数据分区表完整性检测解决方案,具备高可用性、容错性和可扩展性,能够在ConfigNode重启时自动发现并恢复丢失的数据分区信息。 diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java index 324e351302787..d32cb5b416934 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java @@ -63,6 +63,14 @@ public void writeAuditLog( } } + public void generateDataPartitionTableHeartbeat(TEndPoint endPoint, TDataNodeHeartbeatReq req, DataNodeHeartbeatHandler handler) { + try { + clientManager.borrowClient(endPoint).generateDataPartitionTableHeartbeat(req, handler); + } catch (Exception ignore) { + // Just ignore + } + } + private static class AsyncDataNodeHeartbeatClientPoolHolder { private static final AsyncDataNodeHeartbeatClientPool INSTANCE = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java index 4055398ddb7ec..790fd637d616a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java @@ -37,6 +37,11 @@ public enum CnToDnSyncRequestType { DELETE_OLD_REGION_PEER, RESET_PEER_LIST, + // Data Partition Table Maintenance + COLLECT_EARLIEST_TIMESLOTS, + GENERATE_DATA_PARTITION_TABLE, + GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + // PartitionCache INVALIDATE_PARTITION_CACHE, INVALIDATE_PERMISSION_CACHE, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java index d63d5a74f6095..84c027e513298 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java @@ -32,6 +32,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TCreateDataRegionReq; import org.apache.iotdb.mpp.rpc.thrift.TCreatePeerReq; import org.apache.iotdb.mpp.rpc.thrift.TCreateSchemaRegionReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidatePermissionCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TKillQueryInstanceReq; @@ -139,6 +140,15 @@ private void buildActionMap() { actionMapBuilder.put( CnToDnSyncRequestType.SHOW_APPLIED_CONFIGURATIONS, (req, client) -> client.showAppliedConfigurations()); + actionMapBuilder.put( + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + (req, client) -> client.getEarliestTimeslots()); + actionMapBuilder.put( + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, + (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + (req, client) -> client.generateDataPartitionTableHeartbeat()); actionMap = actionMapBuilder.build(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 0fe3abc79a72b..06f9534d39779 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -67,6 +67,7 @@ import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveDataNodesProcedure; +import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.CreatePipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.DropPipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.runtime.PipeHandleLeaderChangeProcedure; @@ -1374,6 +1375,18 @@ public TSStatus createRegionGroups( } } + /** + * Used to repair the lost data partition table + */ + public TSStatus dataPartitionTableIntegrityCheck() { + DataPartitionTableIntegrityCheckProcedure procedure; + synchronized (this) { + procedure = new DataPartitionTableIntegrityCheckProcedure(); + executor.submitProcedure(procedure); + } + return waitingProcedureFinished(procedure); + } + /** * Generate {@link CreateTriggerProcedure} and wait until it finished. * diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java index 64322da5bbb20..a2b1c3ed66ffd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java @@ -279,6 +279,9 @@ private void pingRegisteredDataNodes( AsyncDataNodeHeartbeatClientPool.getInstance() .getDataNodeHeartBeat( dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); + AsyncDataNodeHeartbeatClientPool.getInstance() + .generateDataPartitionTableHeartbeat( + dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java new file mode 100644 index 0000000000000..c1ebd7ffccde1 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.partition; + +import org.apache.iotdb.confignode.manager.ConfigManager; + +/** + * Environment object for ConfigNode procedures. Provides access to ConfigManager and other + * necessary components. + */ +public class ConfigNodeProcedureEnv { + + private final ConfigManager configManager; + + public ConfigNodeProcedureEnv(ConfigManager configManager) { + this.configManager = configManager; + } + + public ConfigManager getConfigManager() { + return configManager; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java new file mode 100644 index 0000000000000..860f34ed1d735 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -0,0 +1,578 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.partition; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; +import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; +import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; +import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; +import org.apache.iotdb.confignode.consensus.request.read.partition.GetDataPartitionPlan; +import org.apache.iotdb.confignode.manager.node.NodeManager; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.exception.ProcedureException; +import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; +import org.apache.iotdb.confignode.procedure.state.DataPartitionTableIntegrityCheckProcedureState; +import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +/** + * Procedure for checking and restoring data partition table integrity. This procedure scans all + * DataNodes to detect missing data partitions and restores the DataPartitionTable on the ConfigNode + * Leader. + */ +public class DataPartitionTableIntegrityCheckProcedure + extends StateMachineProcedure< + ConfigNodeProcedureEnv, DataPartitionTableIntegrityCheckProcedureState> { + + private static final Logger LOG = + LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); + + /** Error codes for DataNode responses */ + public static final int DN_ERROR_CODE_SUCCESS = 0; + + public static final int DN_ERROR_CODE_IN_PROGRESS = 2; + public static final int DN_ERROR_CODE_FAILED = 1; + public static final int DN_ERROR_CODE_UNKNOWN = -1; + + /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ + private Map earliestTimeslots = new ConcurrentHashMap<>(); + + /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ + private Map dataPartitionTables = new ConcurrentHashMap<>(); + + /** Final merged DataPartitionTable */ + private DataPartitionTable finalDataPartitionTable; + + /** List of DataNodes that need to generate DataPartitionTable */ + private List allDataNodes = new ArrayList<>(); + + private Set lostDataPartitionsOfDatabases; + + NodeManager dataNodeManager; + + /** Current retry attempt */ + private int retryCount = 0; + + private static final int MAX_RETRY_COUNT = 3; + + private static Set skipDnIds; + private static Set failedDnIds; + + private static ScheduledExecutorService heartBeatExecutor; + private static final long HEART_BEAT_REQUEST_RATE = 60000; + + public DataPartitionTableIntegrityCheckProcedure() { + super(); + } + + @Override + protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + throws InterruptedException { + try { + // Ensure to get the real-time DataNodes in the current cluster at every step + dataNodeManager = env.getConfigManager().getNodeManager(); + allDataNodes = dataNodeManager.getRegisteredDataNodes(); + + switch (state) { + case COLLECT_EARLIEST_TIMESLOTS: + failedDnIds = new HashSet<>(); + return collectEarliestTimeslots(env); + case ANALYZE_MISSING_PARTITIONS: + return analyzeMissingPartitions(env); + case REQUEST_PARTITION_TABLES: + heartBeatExecutor = Executors.newScheduledThreadPool(allDataNodes.size()); + return requestPartitionTables(env); + case MERGE_PARTITION_TABLES: + return mergePartitionTables(env); + case WRITE_PARTITION_TABLE_TO_RAFT: + return writePartitionTableToRaft(env); + default: + throw new ProcedureException("Unknown state: " + state); + } + } catch (Exception e) { + LOG.error("Error executing state {}: {}", state, e.getMessage(), e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + @Override + protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + throws IOException, InterruptedException, ProcedureException { + switch (state) { + case COLLECT_EARLIEST_TIMESLOTS: + case ANALYZE_MISSING_PARTITIONS: + case REQUEST_PARTITION_TABLES: + case MERGE_PARTITION_TABLES: + case WRITE_PARTITION_TABLE_TO_RAFT: + // Cleanup resources + earliestTimeslots.clear(); + dataPartitionTables.clear(); + allDataNodes.clear(); + finalDataPartitionTable = null; + break; + case SUCCESS: + case FAILED: + // No cleanup needed for terminal states + break; + default: + throw new ProcedureException("Unknown state for rollback: " + state); + } + } + + @Override + protected DataPartitionTableIntegrityCheckProcedureState getState(int stateId) { + return null; + } + + @Override + protected int getStateId(DataPartitionTableIntegrityCheckProcedureState state) { + return 0; + } + + @Override + protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { + skipDnIds = new HashSet<>(); + return DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS; + } + + /** + * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. + */ + private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Collecting earliest timeslots from all DataNodes..."); + } + + if (allDataNodes.isEmpty()) { + LOG.error("No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + // Collect earliest timeslots from all DataNodes + Map mergedEarliestTimeslots = new ConcurrentHashMap<>(); + + for (TDataNodeConfiguration dataNode : allDataNodes) { + try { + TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } + + Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); + + // Merge with existing timeslots (take minimum) + for (Map.Entry entry : nodeTimeslots.entrySet()) { + mergedEarliestTimeslots.merge(entry.getKey(), entry.getValue(), Math::min); + } + + if (LOG.isDebugEnabled()) { + LOG.debug( + "Collected earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + nodeTimeslots); + } + } catch (Exception e) { + LOG.error( + "Failed to collect earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + e.getMessage(), + e); + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + } + } + + earliestTimeslots = mergedEarliestTimeslots; + + if (LOG.isDebugEnabled()) { + LOG.info( + "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", + allDataNodes.size(), + earliestTimeslots, + allDataNodes.size() - failedDnIds.size()); + } + + Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); + if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + } else { + setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); + } + return Flow.HAS_MORE_STATE; + } + + /** + * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. + */ + private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Analyzing missing data partitions..."); + } + + if (earliestTimeslots.isEmpty()) { + LOG.error("No missing data partitions detected, nothing needs to be repaired, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + // Find all databases that have lost data partition tables + lostDataPartitionsOfDatabases = new HashSet<>(); + + for (Map.Entry entry : earliestTimeslots.entrySet()) { + String database = entry.getKey(); + long earliestTimeslot = entry.getValue(); + + // Get current DataPartitionTable from ConfigManager + Map>>> + dataPartitionTable = getLocalDataPartitionTable(env, database); + + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (dataPartitionTable.isEmpty() || dataPartitionTable.get(database) == null || dataPartitionTable.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; + } + + Map>> seriesPartitionMap = dataPartitionTable.get(database); + for (Map.Entry>> + seriesPartitionEntry : seriesPartitionMap.entrySet()) { + Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); + tTimePartitionSlotListMap.keySet().forEach(slot -> { + if (!TimePartitionUtils.satisfyPartitionId(slot.getStartTime(), earliestTimeslot)) { + lostDataPartitionsOfDatabases.add(database); + LOG.warn("Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, earliestTimeslot); + } + }); + } + } + + if (lostDataPartitionsOfDatabases.isEmpty()) { + LOG.info("No databases have lost data partitions, terminating procedure"); + return Flow.NO_MORE_STATE; + } + + LOG.info( + "Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} DataNodes", + lostDataPartitionsOfDatabases.size(), + allDataNodes.size() - failedDnIds.size()); + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); + return Flow.HAS_MORE_STATE; + } + + private Map>>> getLocalDataPartitionTable(ConfigNodeProcedureEnv env, String database) { + Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) + .getSchemaPartitionTable(); + + // Construct request for getting data partition + final Map> partitionSlotsMap = new HashMap<>(); + schemaPartitionTable.forEach( + (key, value) -> { + Map slotListMap = new HashMap<>(); + value + .keySet() + .forEach( + slot -> + slotListMap.put( + slot, new TTimeSlotList(Collections.emptyList(), true, true))); + partitionSlotsMap.put(key, slotListMap); + }); + final GetDataPartitionPlan getDataPartitionPlan = new GetDataPartitionPlan(partitionSlotsMap); + return env.getConfigManager().getDataPartition(getDataPartitionPlan).getDataPartitionTable(); + } + + /** + * Request DataPartitionTable generation from target DataNodes. Each DataNode scans its tsfile + * resources and generates a DataPartitionTable. + */ + private Flow requestPartitionTables(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Requesting DataPartitionTable generation from {} DataNodes...", allDataNodes.size()); + } + + if (allDataNodes.isEmpty()) { + LOG.error("No DataNodes registered, no way to requested DataPartitionTable generation, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + heartBeatExecutor.scheduleAtFixedRate(this::checkPartitionTableGenerationStatus, 0, HEART_BEAT_REQUEST_RATE, TimeUnit.MILLISECONDS); + + for (TDataNodeConfiguration dataNode : allDataNodes) { + int dataNodeId = dataNode.getLocation().getDataNodeId(); + if (!dataPartitionTables.containsKey(dataNodeId)) { + try { + TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error("Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } + + byte[] bytes = resp.getDataPartitionTable(); + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); + dataPartitionTables.put(dataNodeId, dataPartitionTable); + } catch (Exception e) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error( + "Failed to request DataPartitionTable generation from DataNode[id={}]: {}", + dataNodeId, + e.getMessage(), + e); + } + } + } + + Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); + if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; + } + + /** + * Check completion status of DataPartitionTable generation tasks. + */ + private void checkPartitionTableGenerationStatus() { + LOG.info("Checking DataPartitionTable generation completion status..."); + + int completeCount = 0; + for (TDataNodeConfiguration dataNode : allDataNodes) { + int dataNodeId = dataNode.getLocation().getDataNodeId(); + + if (!dataPartitionTables.containsKey(dataNodeId)) { + try { + TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); + DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getStatus().getCode()); + + switch (state) { + case SUCCESS: + LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); + completeCount++; + break; + case IN_PROGRESS: + LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); + break; + case FAILED: + LOG.error("DataNode {} failed to generate DataPartitionTable, terminating heart beat", dataNodeId); + completeCount++; + break; + default: + LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getStatus().getCode()); + break; + } + } catch (Exception e) { + LOG.error( + "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); + completeCount++; + } + } + } + + if (completeCount >= allDataNodes.size()) { + heartBeatExecutor.shutdown(); + } + } + + private static void declineThread() { + heartBeatExecutor.shutdown(); + } + + /** + * Merge DataPartitionTables from all DataNodes into a final table. + */ + private Flow mergePartitionTables(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.info("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); + } + + if (dataPartitionTables.isEmpty()) { + LOG.error("No DataPartitionTables to merge, dataPartitionTables is empty"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + try { + finalDataPartitionTable = new DataPartitionTable(); + + // TODO: Implement proper merging logic + // For now, use the first DataPartitionTable as the final one + if (!dataPartitionTables.isEmpty()) { + DataPartitionTable firstTable = dataPartitionTables.values().iterator().next(); + finalDataPartitionTable = firstTable; + + // In a real implementation, you would: + // 1. Merge all series partition slots from all DataNodes + // 2. For each series slot, merge time slot information + // 3. Resolve conflicts by choosing the most recent/complete data + // 4. Ensure consistency across all DataNodes + + LOG.info( + "Merged DataPartitionTable contains {} series partitions", + finalDataPartitionTable.getDataPartitionMap().size()); + } + + LOG.info("DataPartitionTable merge completed successfully"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; + + } catch (Exception e) { + LOG.error("Failed to merge DataPartitionTables", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + /** Write the final DataPartitionTable to raft log. */ + private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { + LOG.info("Writing DataPartitionTable to raft log..."); + + if (finalDataPartitionTable == null) { + LOG.error("No DataPartitionTable to write to raft"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No DataPartitionTable available for raft write")); + return Flow.NO_MORE_STATE; + } + + try { + // TODO: Implement actual raft log write + // This should create a consensus request to write the DataPartitionTable + // Example: + // WriteDataPartitionTablePlan plan = new + // WriteDataPartitionTablePlan(finalDataPartitionTable); + // env.getConfigManager().getConsensusManager().write(plan); + + // For now, simulate successful write + boolean writeSuccess = true; + + if (writeSuccess) { + LOG.info("DataPartitionTable successfully written to raft log"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.SUCCESS); + return Flow.HAS_MORE_STATE; + } else { + LOG.error("Failed to write DataPartitionTable to raft log"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); + return Flow.NO_MORE_STATE; + } + + } catch (Exception e) { + LOG.error("Error writing DataPartitionTable to raft log", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + // @TODO + @Override + public void serialize(DataOutputStream stream) throws IOException { + super.serialize(stream); + + // Serialize earliestTimeslots + stream.writeInt(earliestTimeslots.size()); + for (Map.Entry entry : earliestTimeslots.entrySet()) { + stream.writeUTF(entry.getKey()); + stream.writeLong(entry.getValue()); + } + + // Serialize dataPartitionTables count + stream.writeInt(dataPartitionTables.size()); + // Note: DataPartitionTable serialization would need to be implemented here + + // Serialize targetDataNodes count + stream.writeInt(targetDataNodes.size()); + for (TDataNodeConfiguration dataNode : targetDataNodes) { + stream.writeInt(dataNode.getLocation().getDataNodeId()); + } + + // Serialize retryCount + stream.writeInt(retryCount); + } + + // @TODO + @Override + public void deserialize(ByteBuffer byteBuffer) { + super.deserialize(byteBuffer); + + // Deserialize earliestTimeslots + int earliestTimeslotsSize = byteBuffer.getInt(); + earliestTimeslots = new ConcurrentHashMap<>(); + for (int i = 0; i < earliestTimeslotsSize; i++) { + String database = String.valueOf(byteBuffer.getChar()); + long timeslot = byteBuffer.getLong(); + earliestTimeslots.put(database, timeslot); + } + + // Deserialize dataPartitionTables count + int dataPartitionTablesSize = byteBuffer.getInt(); + dataPartitionTables = new ConcurrentHashMap<>(); + // Note: DataPartitionTable deserialization would need to be implemented here + + // Deserialize targetDataNodes + int targetDataNodesSize = byteBuffer.getInt(); + targetDataNodes = new ArrayList<>(); + for (int i = 0; i < targetDataNodesSize; i++) { + int dataNodeId = byteBuffer.getInt(); + // Note: TDataNodeLocation reconstruction would need to be implemented here + } + + // Deserialize retryCount + retryCount = byteBuffer.getInt(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java new file mode 100644 index 0000000000000..7028adf9b4b9a --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -0,0 +1,18 @@ +package org.apache.iotdb.confignode.procedure.state; + +public enum DataPartitionTableIntegrityCheckProcedureState { + /** Collect earliest timeslot information from all DataNodes */ + COLLECT_EARLIEST_TIMESLOTS, + /** Analyze missing data partitions */ + ANALYZE_MISSING_PARTITIONS, + /** Request DataPartitionTable generation from DataNodes */ + REQUEST_PARTITION_TABLES, + /** Merge DataPartitionTables from all DataNodes */ + MERGE_PARTITION_TABLES, + /** Write final DataPartitionTable to raft log */ + WRITE_PARTITION_TABLE_TO_RAFT, + /** Procedure completed successfully */ + SUCCESS, + /** Procedure failed */ + FAILED +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java index dd15558608718..140fffa852ccc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java @@ -26,6 +26,7 @@ import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveDataNodesProcedure; +import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.CreatePipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.DropPipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.runtime.PipeHandleLeaderChangeProcedure; @@ -404,6 +405,9 @@ public Procedure create(ByteBuffer buffer) throws IOException { case ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE: procedure = new AddNeverFinishSubProcedureProcedure(); break; + case DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE: + procedure = new DataPartitionTableIntegrityCheckProcedure(); + break; default: LOGGER.error("Unknown Procedure type: {}", typeCode); throw new IOException("Unknown Procedure type: " + typeCode); @@ -554,6 +558,8 @@ public static ProcedureType getProcedureType(final Procedure procedure) { return ProcedureType.NEVER_FINISH_PROCEDURE; } else if (procedure instanceof AddNeverFinishSubProcedureProcedure) { return ProcedureType.ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE; + } else if (procedure instanceof DataPartitionTableIntegrityCheckProcedure) { + return ProcedureType.DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE; } throw new UnsupportedOperationException( "Procedure type " + procedure.getClass() + " is not supported"); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java index 820a90f7ebfb9..839c8ace0984d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java @@ -172,7 +172,10 @@ public enum ProcedureType { @TestOnly NEVER_FINISH_PROCEDURE((short) 30000), @TestOnly - ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE((short) 30001); + ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE((short) 30001), + + /** Data Partition Table Integrity Check */ + DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE((short) 1600); private final short typeCode; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index f20f77095d97a..3befc7f1634f1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -67,7 +67,6 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; - import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -203,6 +202,14 @@ public void active() { } loadSecretKey(); loadHardwareCode(); + + // The data partition table integrity check is only performed when the ConfigNode is the leader node + if (configManager.getConsensusManager().isLeader()) { + TSStatus status = configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + } return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 98c15a2d9bf06..2ce50415549e3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -1219,6 +1219,10 @@ public class IoTDBConfig { private long maxObjectSizeInByte = 4 * 1024 * 1024 * 1024L; + /* Need use these parameters when repair data partition table */ + private int partitionTableRecoverWorkerNum = 10; + private int partitionTableRecoverMaxReadBytesPerSecond = 1000; + IoTDBConfig() {} public int getMaxLogEntriesNumPerBatch() { @@ -4367,4 +4371,20 @@ public long getMaxObjectSizeInByte() { public void setMaxObjectSizeInByte(long maxObjectSizeInByte) { this.maxObjectSizeInByte = maxObjectSizeInByte; } + + public int getPartitionTableRecoverWorkerNum() { + return partitionTableRecoverWorkerNum; + } + + public void setPartitionTableRecoverWorkerNum(int partitionTableRecoverWorkerNum) { + this.partitionTableRecoverWorkerNum = partitionTableRecoverWorkerNum; + } + + public int getPartitionTableRecoverMaxReadBytesPerSecond() { + return partitionTableRecoverMaxReadBytesPerSecond; + } + + public void setPartitionTableRecoverMaxReadBytesPerSecond(int partitionTableRecoverWorkerNum) { + this.partitionTableRecoverWorkerNum = partitionTableRecoverWorkerNum; + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index 6730138b2af5c..4c4d7a6928747 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1139,6 +1139,11 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException // update trusted_uri_pattern loadTrustedUriPattern(properties); + conf.setPartitionTableRecoverWorkerNum( + Integer.parseInt(properties.getProperty("partition_table_recover_worker_num", String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); + conf.setPartitionTableRecoverMaxReadBytesPerSecond( + Integer.parseInt(properties.getProperty("partition_table_recover_max_read_bytes_per_second", String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); + conf.setIncludeNullValueInWriteThroughputMetric( Boolean.parseBoolean( properties.getProperty( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java new file mode 100644 index 0000000000000..689a12bd8df89 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.partition; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; +import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.SeriesPartitionTable; +import org.apache.iotdb.commons.partition.executor.SeriesPartitionExecutor; +import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Generator for DataPartitionTable by scanning tsfile resources. This class scans the data + * directory structure and builds a complete DataPartitionTable based on existing tsfiles. + */ +public class DataPartitionTableGenerator { + + private static final Logger LOG = LoggerFactory.getLogger(DataPartitionTableGenerator.class); + + // Task status + private volatile TaskStatus status = TaskStatus.NOT_STARTED; + private volatile String errorMessage; + private volatile DataPartitionTable dataPartitionTable; + + // Progress tracking + private final AtomicInteger processedFiles = new AtomicInteger(0); + private final AtomicInteger failedFiles = new AtomicInteger(0); + private final AtomicLong totalFiles = new AtomicLong(0); + + // Configuration + private final String[] dataDirectories; + private final ExecutorService executor; + private final int seriesSlotNum; + private final String seriesPartitionExecutorClass; + + private static final int EXECUTOR_MAX_TIMEOUT = 60; + + private static final LeakyBucketRateLimiter limiter = + new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); + + private static final String SCAN_FILE_SUFFIX_NAME = ".resource"; + + public DataPartitionTableGenerator( + String dataDirectory, + ExecutorService executor, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.dataDirectories = new String[]{dataDirectory}; + this.executor = executor; + this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + + public DataPartitionTableGenerator( + String[] dataDirectories, + ExecutorService executor, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.dataDirectories = dataDirectories; + this.executor = executor; + this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + + public enum TaskStatus { + NOT_STARTED, + IN_PROGRESS, + COMPLETED, + FAILED + } + + /** Start generating DataPartitionTable asynchronously. */ + public void startGeneration() { + if (status != TaskStatus.NOT_STARTED) { + throw new IllegalStateException("Task is already started or completed"); + } + + status = TaskStatus.IN_PROGRESS; + + CompletableFuture.runAsync( + () -> { + try { + generateDataPartitionTable(); + status = TaskStatus.COMPLETED; + } catch (Exception e) { + LOG.error("Failed to generate DataPartitionTable", e); + errorMessage = e.getMessage(); + status = TaskStatus.FAILED; + } + }); + } + + /** Generate DataPartitionTable by scanning all resource files. */ + private void generateDataPartitionTable() throws IOException { + LOG.info("Starting DataPartitionTable generation from {} directories", dataDirectories.length); + + List> futures = new ArrayList<>(); + + Map dataPartitionMap = new ConcurrentHashMap<>(); + + try { + // Count total files first for progress tracking + countTotalFiles(); + + // Process all data directories + for (String dataDirectory : dataDirectories) { + LOG.info("Processing data directory: {}", dataDirectory); + + // First layer: database directories + Files.list(Paths.get(dataDirectory)) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + LOG.debug("Processing database: {}", databaseName); + + try { + Files.list(dbPath) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } + + // Wait for all tasks to complete + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + + dataPartitionTable = new DataPartitionTable(dataPartitionMap); + + LOG.info( + "DataPartitionTable generation completed. Processed: {}, Failed: {}", + processedFiles.get(), + failedFiles.get()); + + } finally { + executor.shutdown(); + try { + if (!executor.awaitTermination(EXECUTOR_MAX_TIMEOUT, TimeUnit.SECONDS)) { + executor.shutdownNow(); + } + } catch (InterruptedException e) { + executor.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + } + + /** Process a region directory. */ + private void processRegionDirectory( + java.nio.file.Path regionPath, + String databaseName, + Map dataPartitionMap, + ExecutorService executor, + List> futures) { + + int regionId; + try { + regionId = Integer.parseInt(regionPath.getFileName().toString()); + LOG.debug("Processing region: {}", regionId); + } catch (NumberFormatException e) { + LOG.error("Invalid region directory: {}", regionPath); + return; + } + + TConsensusGroupId consensusGroupId = new TConsensusGroupId(); + consensusGroupId.setId(regionId); + + // Process time partitions asynchronously + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach( + timeSlotPath -> { + processTimeSlotDirectory( + timeSlotPath, databaseName, consensusGroupId, dataPartitionMap); + }); + } catch (IOException e) { + LOG.error("Failed to list region directory: {}", regionPath, e); + } + }, + executor); + + futures.add(regionFuture); + } + + /** Process a time slot directory. */ + private void processTimeSlotDirectory( + java.nio.file.Path timeSlotPath, + String databaseName, + TConsensusGroupId consensusGroupId, + Map dataPartitionMap) { + + long timeSlotLong; + try { + timeSlotLong = Long.parseLong(timeSlotPath.getFileName().toString()); + LOG.debug("Processing time slot: {}", timeSlotLong); + } catch (NumberFormatException e) { + LOG.error("Invalid time slot directory: {}", timeSlotPath); + return; + } + + try { + // Fourth layer: .tsfile files + Files.walk(timeSlotPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach( + tsFilePath -> { + processTsFile( + tsFilePath.toFile(), + consensusGroupId, + timeSlotLong, + dataPartitionMap); + }); + } catch (IOException e) { + LOG.error("Failed to walk time slot directory: {}", timeSlotPath, e); + } + } + + /** Process a single tsfile. */ + private void processTsFile( + File tsFile, + TConsensusGroupId consensusGroupId, + long timeSlotId, + Map dataPartitionMap) { + try { + TsFileResource tsFileResource = new TsFileResource(tsFile.getAbsoluteFile()); + tsFileResource.deserialize(); + + Set devices = tsFileResource.getDevices(limiter); + processedFiles.incrementAndGet(); + + SeriesPartitionExecutor seriesPartitionExecutor = + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); + + for (org.apache.tsfile.file.metadata.IDeviceID deviceId : devices) { + TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + } + + if (processedFiles.get() % 1000 == 0) { + LOG.info("Processed {} files, current: {}", processedFiles.get(), tsFile.getName()); + } + } catch (IOException e) { + failedFiles.incrementAndGet(); + LOG.error("Failed to process tsfile: {} -> {}", tsFile.getAbsolutePath(), e.getMessage()); + } + } + + private static SeriesPartitionTable newSeriesPartitionTable(TConsensusGroupId consensusGroupId, long timeSlotId) { + SeriesPartitionTable seriesPartitionTable = new SeriesPartitionTable(); + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + seriesPartitionTable.putDataPartition(timePartitionSlot, consensusGroupId); + return seriesPartitionTable; + } + + /** Count total files for progress tracking. */ + private void countTotalFiles() throws IOException { + AtomicLong fileCount = new AtomicLong(0); + + for (String dataDirectory : dataDirectories) { + Files.walk(Paths.get(dataDirectory)) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } + + totalFiles.set(fileCount.get()); + LOG.info("Found {} resource files to process", totalFiles.get()); + } + + // Getters + public TaskStatus getStatus() { + return status; + } + + public String getErrorMessage() { + return errorMessage; + } + + public DataPartitionTable getDataPartitionTable() { + return dataPartitionTable; + } + + public int getProcessedFiles() { + return processedFiles.get(); + } + + public int getFailedFiles() { + return failedFiles.get(); + } + + public long getTotalFiles() { + return totalFiles.get(); + } + + public double getProgress() { + if (totalFiles.get() == 0) { + return 0.0; + } + return (double) (processedFiles.get() + failedFiles.get()) / totalFiles.get(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java index 9c44de9f5fdca..881e823ef2d67 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java @@ -55,7 +55,10 @@ public enum OperationType { WRITE_AUDIT_LOG("writeAuditLog"), PREPARE_STATEMENT("prepareStatement"), EXECUTE_PREPARED_STATEMENT("executePreparedStatement"), - DEALLOCATE_PREPARED_STATEMENT("deallocatePreparedStatement"); + DEALLOCATE_PREPARED_STATEMENT("deallocatePreparedStatement"), + GET_EARLIEST_TIMESLOTS("getEarliestTimeslots"), + GENERATE_DATA_PARTITION_TABLE("generateDataPartitionTable"), + CHECK_DATA_PARTITION_TABLE_STATUS("checkDataPartitionTableStatus"); private final String name; OperationType(String name) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 42929be741819..cf2178c3569c1 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,6 +19,7 @@ package org.apache.iotdb.db.protocol.thrift.impl; +import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -61,8 +62,10 @@ import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.consensus.index.ProgressIndex; import org.apache.iotdb.commons.consensus.index.ProgressIndexType; +import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.path.ExtendedPartialPath; import org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; @@ -102,6 +105,7 @@ import org.apache.iotdb.db.consensus.SchemaRegionConsensusImpl; import org.apache.iotdb.db.exception.StorageEngineException; import org.apache.iotdb.db.exception.query.QueryProcessException; +import org.apache.iotdb.db.partition.DataPartitionTableGenerator; import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.protocol.client.cn.DnToCnInternalServiceAsyncRequestManager; @@ -260,6 +264,10 @@ import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerReq; import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerResp; import org.apache.iotdb.mpp.rpc.thrift.TFragmentInstanceInfoResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableHeartbeatResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TInactiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateColumnCacheReq; @@ -314,9 +322,10 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; - -import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.transport.TIOStreamTransport; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; import org.apache.tsfile.read.common.TimeRange; @@ -331,9 +340,11 @@ import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; +import java.io.File; import java.io.IOException; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.file.Files; import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; @@ -370,7 +381,6 @@ import static org.apache.iotdb.db.utils.ErrorHandlingUtils.onQueryException; public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface { - private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeInternalRPCServiceImpl.class); @@ -414,6 +424,32 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private static final String SYSTEM = "system"; + private final ExecutorService findEarliestTimeSlotExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), + ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + + private final ExecutorService partitionTableRecoverExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), + ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + + private static final long timeoutMs = 600000; // 600 seconds timeout + public DataNodeInternalRPCServiceImpl() { super(); partitionFetcher = ClusterPartitionFetcher.getInstance(); @@ -3117,4 +3153,258 @@ public TSStatus writeAuditLog(TAuditLogReq req) { public void handleClientExit() { // Do nothing } + + // ==================================================== + // Data Partition Table Integrity Check Implementation + // ==================================================== + + private volatile DataPartitionTableGenerator currentGenerator; + private volatile long currentTaskId = 0; + + @Override + public TGetEarliestTimeslotsResp getEarliestTimeslots() { + TGetEarliestTimeslotsResp resp = new TGetEarliestTimeslotsResp(); + + try { + Map earliestTimeslots = new HashMap<>(); + + // Get data directories from configuration + String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + + for (String dataDir : dataDirs) { + File dir = new File(dataDir); + if (dir.exists() && dir.isDirectory()) { + processDataDirectoryForEarliestTimeslots(dir, earliestTimeslots); + } + } + + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + resp.setDatabaseToEarliestTimeslot(earliestTimeslots); + + LOGGER.info("Retrieved earliest timeslots for {} databases", earliestTimeslots.size()); + + } catch (Exception e) { + LOGGER.error("Failed to get earliest timeslots", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.GET_EARLIEST_TIMESLOTS, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + @Override + public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); + byte[] empty = new byte[0]; + + try { + // Check if there's already a task in the progress + if (currentGenerator != null + && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage("DataPartitionTable generation is already in the progress"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + // Get data directories and configuration + String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + if (dataDirs.length == 0) { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("dataDirs parameter are not configured in the iotdb-system.properties"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + // Create generator for all data directories + int seriesSlotNum = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionSlotNum(); + String seriesPartitionExecutorClass = + IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); + + currentGenerator = + new DataPartitionTableGenerator( + dataDirs, partitionTableRecoverExecutor, seriesSlotNum, seriesPartitionExecutorClass); + currentTaskId = System.currentTimeMillis(); + + // Start generation synchronously for now to return the data partition table immediately + currentGenerator.startGeneration(); + + // Wait for completion (with timeout) + long startTime = System.currentTimeMillis(); + + while (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + if (System.currentTimeMillis() - startTime > timeoutMs) { + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage("DataPartitionTable generation timed out"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + try { + Thread.sleep(100); // Sleep for 100ms + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation interrupted"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + } + + // Check final status + if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { + DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); + if (dataPartitionTable != null) { + ByteBuffer result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result.array()); + } + + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + + LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); + } else { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + } + + // Clear current generator + currentGenerator = null; + } catch (Exception e) { + LOGGER.error("Failed to generate DataPartitionTable", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.GENERATE_DATA_PARTITION_TABLE, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + @Override + public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() { + TGenerateDataPartitionTableHeartbeatResp resp = new TGenerateDataPartitionTableHeartbeatResp(); + + try { + if (currentGenerator == null) { + resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); + resp.setMessage("No DataPartitionTable generation task found"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + DataPartitionTableGenerator.TaskStatus status = currentGenerator.getStatus(); + + switch (status) { + case IN_PROGRESS: + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage( + String.format( + "DataPartitionTable generation in progress: %.1f%%", + currentGenerator.getProgress() * 100)); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + case COMPLETED: + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + break; + case FAILED: + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + default: + resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); + resp.setMessage("Unknown task status: " + status); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + } + } catch (Exception e) { + LOGGER.error("Failed to check DataPartitionTable generation status", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.CHECK_DATA_PARTITION_TABLE_STATUS, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + /** Process data directory to find the earliest timeslots for each database. */ + private void processDataDirectoryForEarliestTimeslots( + File dataDir, Map earliestTimeslots) { + try { + Files.list(dataDir.toPath()) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to process data directory: {}", dataDir, e); + } + } + + /** Find the earliest timeslot in a database directory. */ + private long findEarliestTimeslotInDatabase(File databaseDir) { + final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); + + try { + Files.walk(databaseDir.toPath()) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + findEarliestTimeSlotExecutor.submit(() -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach(timeSlotPath -> { + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot < earliest.get()) { + earliest.set(timeslot); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to scan {}", regionPath, e); + } + }); + }); + } catch (IOException e) { + LOGGER.error("Failed to walk database directory: {}", databaseDir, e); + } + + return earliest.get(); + } + + /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ + private ByteBuffer serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + TIOStreamTransport tioStreamTransport = new TIOStreamTransport(baos)) { + TProtocol protocol = new TBinaryProtocol(tioStreamTransport); + dataPartitionTable.serialize(baos, protocol); + return ByteBuffer.wrap(baos.toByteArray()); + } catch (Exception e) { + LOGGER.error("Failed to serialize DataPartitionTable", e); + return ByteBuffer.allocate(0); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java index b84cce9e8d21b..d625b753e193b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java @@ -27,6 +27,7 @@ import org.apache.iotdb.commons.pipe.datastructure.resource.PersistentResource; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TestOnly; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.exception.load.PartitionViolationException; @@ -677,6 +678,10 @@ public Set getDevices() { return timeIndex.getDevices(file.getPath(), this); } + public Set getDevices(LeakyBucketRateLimiter limiter) { + return timeIndex.getDevicesByRateLimiter(file.getPath(), this, limiter); + } + public ArrayDeviceTimeIndex buildDeviceTimeIndex(IDeviceID.Deserializer deserializer) throws IOException { readLock(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index e4a812012a8e3..6cbcc48021f77 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -22,6 +22,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; @@ -120,6 +121,41 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } } + @Override + public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + byte[] buffer = new byte[64 * 1024]; + tsFileResource.readLock(); + try (InputStream inputStream = + FSFactoryProducer.getFSFactory() + .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + // The first byte is VERSION_NUMBER, second byte is timeIndexType. + byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); + limiter.acquire(bytes.length); + if (bytes[1] == ARRAY_DEVICE_TIME_INDEX_TYPE) { + return ArrayDeviceTimeIndex.getDevices(inputStream); + } else { + return PlainDeviceTimeIndex.getDevices(inputStream); + } + } catch (NoSuchFileException e) { + // deleted by ttl + if (tsFileResource.isDeleted()) { + return Collections.emptySet(); + } else { + logger.error( + "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + throw new RuntimeException( + "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); + } + } catch (Exception e) { + logger.error( + "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + throw new RuntimeException( + "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); + } finally { + tsFileResource.readUnlock(); + } + } + @Override public boolean endTimeEmpty() { return endTime == Long.MIN_VALUE; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index d705a2417d7c6..7b3f047b34d92 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -20,9 +20,9 @@ package org.apache.iotdb.db.storageengine.dataregion.tsfile.timeindex; import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; - import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.utils.Pair; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -74,6 +74,13 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali */ Set getDevices(String tsFilePath, TsFileResource tsFileResource); + /** + * get devices in TimeIndex that use inputStream + * + * @return device names + */ + Set getDevicesByRateLimiter(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + /** * @return whether end time is empty (Long.MIN_VALUE) */ diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index f4ebae2fb807e..f90f664572553 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -742,6 +742,21 @@ failure_detector_phi_acceptable_pause_in_ms=10000 # Datatype: double(percentage) disk_space_warning_threshold=0.05 +# The number of threads used for parallel scanning in the partition table recovery +# effectiveMode: restart +# Datatype: Integer +partition_table_recover_worker_num=10 + +# Limit the number of files used for parallel processing +# effectiveMode: restart +# Datatype: Integer +#partition_table_recover_process_file_num=1000 + +# Limit the number of bytes read per second from a file +# effectiveMode: restart +# Datatype: Integer +partition_table_recover_max_read_bytes_per_second=10 + #################### ### Memory Control Configuration #################### diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index 6f9f95ca8fe88..39bc7eebfa92b 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -202,6 +202,8 @@ public enum ThreadName { FILE_TIME_INDEX_RECORD("FileTimeIndexRecord"), BINARY_ALLOCATOR_SAMPLE_EVICTOR("BinaryAllocator-SampleEvictor"), BINARY_ALLOCATOR_AUTO_RELEASER("BinaryAllocator-Auto-Releaser"), + FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL("FindEarliestTimeSlot-Parallel-Pool"), + DATA_PARTITION_RECOVER_PARALLEL_POOL("DataPartitionRecover-Parallel-Pool"), // the unknown thread name is used for metrics UNKNOWN("UNKNOWN"); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java new file mode 100644 index 0000000000000..0d0d09c182e05 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java @@ -0,0 +1,33 @@ +package org.apache.iotdb.commons.enums; + +public enum DataPartitionTableGeneratorState { + SUCCESS(0), + FAILED(1), + IN_PROGRESS(2), + UNKNOWN(-1); + + private final int code; + + DataPartitionTableGeneratorState(int code) { + this.code = code; + } + + public int getCode() { + return code; + } + + /** + * get DataNodeRemoveState by code + * + * @param code code + * @return DataNodeRemoveState + */ + public static DataPartitionTableGeneratorState getStateByCode(int code) { + for (DataPartitionTableGeneratorState state : DataPartitionTableGeneratorState.values()) { + if (state.code == code) { + return state; + } + } + return UNKNOWN; + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index eb53cdb2798dd..c5dd3e401d13e 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -77,6 +77,12 @@ public static TTimePartitionSlot getTimePartitionSlot(long time) { return timePartitionSlot; } + public static TTimePartitionSlot getTimePartitionSlot(long partitionId) { + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(); + timePartitionSlot.setStartTime(getTimePartitionLowerBound(time)); + return timePartitionSlot; + } + public static long getTimePartitionInterval() { return timePartitionInterval; } @@ -112,6 +118,14 @@ public static long getTimePartitionId(long time) { : time / timePartitionInterval - 1; } + public static long getTime(long partitionId) { + long time = partitionId * timePartitionInterval; + if (time > 0 || time % timePartitionInterval == 0) { + return time + timePartitionOrigin; + } + return ((partitionId + 1) * timePartitionInterval) + timePartitionOrigin; + } + public static long getTimePartitionIdWithoutOverflow(long time) { BigInteger bigTime = BigInteger.valueOf(time).subtract(bigTimePartitionOrigin); BigInteger partitionId = @@ -122,6 +136,18 @@ public static long getTimePartitionIdWithoutOverflow(long time) { return partitionId.longValue(); } + public static long getTimeWithoutOverflow(long partitionId) { + BigInteger bigTime = bigTimePartitionInterval.multiply(BigInteger.valueOf(partitionId)); + if (bigTime.compareTo(BigInteger.ZERO) > 0 || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { + return bigTime.add(bigTimePartitionOrigin).longValue(); + } + return BigInteger.valueOf(partitionId).add(BigInteger.ONE).multiply(bigTimePartitionInterval).add(bigTimePartitionOrigin).longValue(); + } + + public static long getTimeByPartitionId(long partitionId) { + return originMayCauseOverflow ? getTimeWithoutOverflow(partitionId) : getTime(partitionId); + } + public static boolean satisfyPartitionId(long startTime, long endTime, long partitionId) { long startPartition = originMayCauseOverflow @@ -134,6 +160,13 @@ public static boolean satisfyPartitionId(long startTime, long endTime, long part return startPartition <= partitionId && endPartition >= partitionId; } + public static boolean satisfyPartitionId(long startTime, long partitionId) { + long endTime = startTime >= timePartitionLowerBoundWithoutOverflow + ? Long.MAX_VALUE + : (startTime + timePartitionInterval - 1); + return satisfyPartitionId(startTime, endTime, partitionId); + } + public static boolean satisfyPartitionStartTime(Filter timeFilter, long partitionStartTime) { if (timeFilter == null) { return true; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java new file mode 100644 index 0000000000000..faff05c6ff69c --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.utils.rateLimiter; + +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; + +/** + * A global leaky-bucket rate limiter for bytes throughput. + * Features: + * - Strict throughput limiting (no burst) + * - Smooth bandwidth shaping + * - Thread-safe + * - Fair for multi-thread + * - Low contention + */ +public class LeakyBucketRateLimiter { + /** bytes per second */ + private volatile long bytesPerSecond; + + /** start time */ + private final long startTimeNs; + + /** total consumed bytes */ + private final AtomicLong totalBytes = new AtomicLong(0); + + public LeakyBucketRateLimiter(long bytesPerSecond) { + if (bytesPerSecond <= 0) { + throw new IllegalArgumentException("bytesPerSecond must be > 0"); + } + this.bytesPerSecond = bytesPerSecond; + this.startTimeNs = System.nanoTime(); + } + + /** + * Acquire permission for reading bytes. + * + * This method will block if reading too fast. + */ + public void acquire(long bytes) { + if (bytes <= 0) { + return; + } + + long currentTotal = totalBytes.addAndGet(bytes); + + long expectedTimeNs = expectedTimeNs(currentTotal); + long now = System.nanoTime(); + + long sleepNs = expectedTimeNs - now; + + if (sleepNs > 0) { + LockSupport.parkNanos(sleepNs); + } + } + + /** + * Try acquire without blocking. + * + * @return true if allowed immediately + */ + public boolean tryAcquire(long bytes) { + if (bytes <= 0) { + return true; + } + + long currentTotal = totalBytes.addAndGet(bytes); + + long expectedTimeNs = expectedTimeNs(currentTotal); + long now = System.nanoTime(); + + if (expectedTimeNs <= now) { + return true; + } + + // rollback + totalBytes.addAndGet(-bytes); + return false; + } + + /** + * Update rate dynamically. + */ + public void setRate(long newBytesPerSecond) { + if (newBytesPerSecond <= 0) { + throw new IllegalArgumentException("bytesPerSecond must be > 0"); + } + this.bytesPerSecond = newBytesPerSecond; + } + + /** + * Current rate. + */ + public long getRate() { + return bytesPerSecond; + } + + /** + * Total bytes processed. + */ + public long getTotalBytes() { + return totalBytes.get(); + } + + /** + * Expected time based on bytes processed. + */ + private long expectedTimeNs(long totalBytes) { + return startTimeNs + (totalBytes * 1_000_000_000L) / bytesPerSecond; + } +} diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index cca7110f28d40..54479b2859875 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -678,6 +678,36 @@ struct TAuditLogReq { 11: required i32 cnId } +/** +* BEGIN: Data Partition Table Integrity Check Structures +**/ + +struct TGetEarliestTimeslotsResp { + 1: required common.TSStatus status + 2: optional map databaseToEarliestTimeslot +} + +struct TGenerateDataPartitionTableReq { + 1: required string database +} + +struct TGenerateDataPartitionTableResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message + 4: optional binary dataPartitionTable +} + +struct TGenerateDataPartitionTableHeartbeatResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message +} + +/** +* END: Data Partition Table Integrity Check Structures +**/ + /** * BEGIN: Used for EXPLAIN ANALYZE **/ @@ -1276,6 +1306,30 @@ service IDataNodeRPCService { * Write an audit log entry to the DataNode's AuditEventLogger */ common.TSStatus writeAuditLog(TAuditLogReq req); + + /** + * BEGIN: Data Partition Table Integrity Check + **/ + + /** + * Get earliest timeslot information from DataNode + * Returns map of database name to earliest timeslot id + */ + TGetEarliestTimeslotsResp getEarliestTimeslots() + + /** + * Request DataNode to generate DataPartitionTable by scanning tsfile resources + */ + TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) + + /** + * Check the status of DataPartitionTable generation task + */ + TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() + + /** + * END: Data Partition Table Integrity Check + **/ } service MPPDataExchangeService { From 1fb3ab59c27e124d4425b1d4592b99b1b966712a Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 9 Mar 2026 18:54:03 +0800 Subject: [PATCH 02/13] Debugged and verified all key logic in the procedure. --- .../AsyncDataNodeHeartbeatClientPool.java | 8 - .../confignode/conf/ConfigNodeConfig.java | 10 + .../confignode/conf/ConfigNodeDescriptor.java | 7 + .../load/service/HeartbeatService.java | 3 - .../confignode/manager/node/NodeManager.java | 48 +++ ...PartitionTableIntegrityCheckProcedure.java | 332 +++++++++++------- .../iotdb/confignode/service/ConfigNode.java | 47 ++- .../DataPartitionTableGenerator.java | 100 ++++-- .../impl/DataNodeInternalRPCServiceImpl.java | 77 ++-- .../dataregion/tsfile/TsFileResource.java | 2 +- .../timeindex/ArrayDeviceTimeIndex.java | 6 + .../tsfile/timeindex/FileTimeIndex.java | 1 - .../tsfile/timeindex/ITimeIndex.java | 4 +- .../DataNodeInternalRPCServiceImplTest.java | 50 +++ .../conf/iotdb-system.properties.template | 6 + .../iotdb/commons/ServerCommandLine.java | 3 +- .../commons/utils/TimePartitionUtils.java | 6 - .../src/main/thrift/datanode.thrift | 2 +- 18 files changed, 513 insertions(+), 199 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java index d32cb5b416934..324e351302787 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java @@ -63,14 +63,6 @@ public void writeAuditLog( } } - public void generateDataPartitionTableHeartbeat(TEndPoint endPoint, TDataNodeHeartbeatReq req, DataNodeHeartbeatHandler handler) { - try { - clientManager.borrowClient(endPoint).generateDataPartitionTableHeartbeat(req, handler); - } catch (Exception ignore) { - // Just ignore - } - } - private static class AsyncDataNodeHeartbeatClientPoolHolder { private static final AsyncDataNodeHeartbeatClientPool INSTANCE = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index 88e8d76001dc5..1c1485d90a0b8 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -316,6 +316,8 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; + private long partitionTableRecoverWaitAllDnUpTimeout=2000; + public ConfigNodeConfig() { // empty constructor } @@ -1275,4 +1277,12 @@ public long getFailureDetectorPhiAcceptablePauseInMs() { public void setFailureDetectorPhiAcceptablePauseInMs(long failureDetectorPhiAcceptablePauseInMs) { this.failureDetectorPhiAcceptablePauseInMs = failureDetectorPhiAcceptablePauseInMs; } + + public long getPartitionTableRecoverWaitAllDnUpTimeout() { + return partitionTableRecoverWaitAllDnUpTimeout; + } + + public void setPartitionTableRecoverWaitAllDnUpTimeout(long partitionTableRecoverWaitAllDnUpTimeout) { + this.partitionTableRecoverWaitAllDnUpTimeout = partitionTableRecoverWaitAllDnUpTimeout; + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 0ea7a278732e5..12a1f08b953e5 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -319,6 +319,13 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio "failure_detector_phi_acceptable_pause_in_ms", String.valueOf(conf.getFailureDetectorPhiAcceptablePauseInMs())))); + conf.setPartitionTableRecoverWaitAllDnUpTimeout( + Long.parseLong( + properties.getProperty( + "partition_table_recover_wait_all_dn_up_timeout", + String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout()))) + ); + String leaderDistributionPolicy = properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy()); if (AbstractLeaderBalancer.GREEDY_POLICY.equals(leaderDistributionPolicy) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java index a2b1c3ed66ffd..64322da5bbb20 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java @@ -279,9 +279,6 @@ private void pingRegisteredDataNodes( AsyncDataNodeHeartbeatClientPool.getInstance() .getDataNodeHeartBeat( dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); - AsyncDataNodeHeartbeatClientPool.getInstance() - .generateDataPartitionTableHeartbeat( - dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java index e3d775259d626..7a7cf3ff13290 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java @@ -352,6 +352,9 @@ public DataSet registerDataNode(TDataNodeRegisterReq req) { // Adjust the maximum RegionGroup number of each Database getClusterSchemaManager().adjustMaxRegionGroupNum(); + // Check if all DataNodes are registered and trigger integrity check if needed + checkAndTriggerIntegrityCheck(); + resp.setStatus(ClusterNodeStartUtils.ACCEPT_NODE_REGISTRATION); resp.setDataNodeId( registerDataNodePlan.getDataNodeConfiguration().getLocation().getDataNodeId()); @@ -1346,4 +1349,49 @@ private TTLManager getTTLManager() { private ExternalServiceManager getServiceManager() { return configManager.getExternalServiceManager(); } + + /** + * Check if all DataNodes are registered and running, then trigger integrity check. + * This method should be called after each DataNode registration. + */ + private void checkAndTriggerIntegrityCheck() { + // Only trigger integrity check if this ConfigNode is the leader + if (!configManager.getConsensusManager().isLeader()) { + return; + } + + // Get all registered DataNodes + List registeredDataNodes = getRegisteredDataNodes(); + + // Check if all registered DataNodes are running + boolean allDataNodesRunning = registeredDataNodes.stream() + .allMatch(dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }); + + if (allDataNodesRunning && !registeredDataNodes.isEmpty()) { + LOGGER.info("All {} DataNodes are registered and running, triggering data partition table integrity check", + registeredDataNodes.size()); + + // Trigger integrity check asynchronously + try { + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + LOGGER.info("Data partition table integrity check procedure submitted successfully"); + } catch (Exception e) { + LOGGER.error("Failed to submit data partition table integrity check procedure", e); + } + } else { + LOGGER.debug("Not all DataNodes are ready yet. Registered: {}, Running: {}", + registeredDataNodes.size(), + (int) registeredDataNodes.stream() + .filter(dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }) + .count()); + } + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 860f34ed1d735..5c06b29418f65 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -21,28 +21,42 @@ import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; import org.apache.iotdb.confignode.consensus.request.read.partition.GetDataPartitionPlan; +import org.apache.iotdb.confignode.consensus.request.write.partition.CreateDataPartitionPlan; import org.apache.iotdb.confignode.manager.node.NodeManager; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.state.DataPartitionTableIntegrityCheckProcedureState; import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableHeartbeatResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.transport.TIOStreamTransport; +import org.apache.thrift.transport.TTransport; +import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -69,46 +83,36 @@ public class DataPartitionTableIntegrityCheckProcedure private static final Logger LOG = LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); - /** Error codes for DataNode responses */ - public static final int DN_ERROR_CODE_SUCCESS = 0; + private static final int MAX_RETRY_COUNT = 3; + private static final long HEART_BEAT_REQUEST_RATE = 60000; - public static final int DN_ERROR_CODE_IN_PROGRESS = 2; - public static final int DN_ERROR_CODE_FAILED = 1; - public static final int DN_ERROR_CODE_UNKNOWN = -1; + NodeManager dataNodeManager; + private List allDataNodes = new ArrayList<>(); + //============Need serialize BEGIN=============/ /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ private Map earliestTimeslots = new ConcurrentHashMap<>(); /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ private Map dataPartitionTables = new ConcurrentHashMap<>(); + private Set lostDataPartitionsOfDatabases = new HashSet<>(); + /** Final merged DataPartitionTable */ private DataPartitionTable finalDataPartitionTable; - /** List of DataNodes that need to generate DataPartitionTable */ - private List allDataNodes = new ArrayList<>(); - - private Set lostDataPartitionsOfDatabases; - - NodeManager dataNodeManager; - - /** Current retry attempt */ - private int retryCount = 0; - - private static final int MAX_RETRY_COUNT = 3; - - private static Set skipDnIds; - private static Set failedDnIds; + private static Set skipDnIds = new HashSet<>(); + private static Set failedDnIds = new HashSet<>(); private static ScheduledExecutorService heartBeatExecutor; - private static final long HEART_BEAT_REQUEST_RATE = 60000; + //============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { super(); } @Override - protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws InterruptedException { try { // Ensure to get the real-time DataNodes in the current cluster at every step @@ -120,9 +124,10 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIn failedDnIds = new HashSet<>(); return collectEarliestTimeslots(env); case ANALYZE_MISSING_PARTITIONS: + lostDataPartitionsOfDatabases = new HashSet<>(); return analyzeMissingPartitions(env); case REQUEST_PARTITION_TABLES: - heartBeatExecutor = Executors.newScheduledThreadPool(allDataNodes.size()); + heartBeatExecutor = Executors.newScheduledThreadPool(1); return requestPartitionTables(env); case MERGE_PARTITION_TABLES: return mergePartitionTables(env); @@ -139,7 +144,7 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIn } @Override - protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + protected void rollbackState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws IOException, InterruptedException, ProcedureException { switch (state) { case COLLECT_EARLIEST_TIMESLOTS: @@ -153,23 +158,19 @@ protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableInteg allDataNodes.clear(); finalDataPartitionTable = null; break; - case SUCCESS: - case FAILED: - // No cleanup needed for terminal states - break; default: throw new ProcedureException("Unknown state for rollback: " + state); } } @Override - protected DataPartitionTableIntegrityCheckProcedureState getState(int stateId) { - return null; + protected DataPartitionTableIntegrityCheckProcedureState getState(final int stateId) { + return DataPartitionTableIntegrityCheckProcedureState.values()[stateId]; } @Override - protected int getStateId(DataPartitionTableIntegrityCheckProcedureState state) { - return 0; + protected int getStateId(final DataPartitionTableIntegrityCheckProcedureState state) { + return state.ordinal(); } @Override @@ -182,7 +183,7 @@ protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. */ - private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { + private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.debug("Collecting earliest timeslots from all DataNodes..."); } @@ -194,8 +195,6 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } // Collect earliest timeslots from all DataNodes - Map mergedEarliestTimeslots = new ConcurrentHashMap<>(); - for (TDataNodeConfiguration dataNode : allDataNodes) { try { TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() @@ -207,10 +206,14 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); +// Map nodeTimeslots = new HashMap<>(); +// nodeTimeslots.put("test", 2927L); +// nodeTimeslots.put("root.test", 0L); +// nodeTimeslots.put("root.demo", 0L); // Merge with existing timeslots (take minimum) for (Map.Entry entry : nodeTimeslots.entrySet()) { - mergedEarliestTimeslots.merge(entry.getKey(), entry.getValue(), Math::min); + earliestTimeslots.merge(entry.getKey(), entry.getValue(), Math::min); } if (LOG.isDebugEnabled()) { @@ -229,8 +232,6 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } } - earliestTimeslots = mergedEarliestTimeslots; - if (LOG.isDebugEnabled()) { LOG.info( "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", @@ -251,7 +252,7 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { /** * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. */ - private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { + private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.debug("Analyzing missing data partitions..."); } @@ -263,23 +264,21 @@ private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { } // Find all databases that have lost data partition tables - lostDataPartitionsOfDatabases = new HashSet<>(); - for (Map.Entry entry : earliestTimeslots.entrySet()) { String database = entry.getKey(); long earliestTimeslot = entry.getValue(); // Get current DataPartitionTable from ConfigManager Map>>> - dataPartitionTable = getLocalDataPartitionTable(env, database); + localDataPartitionTable = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (dataPartitionTable.isEmpty() || dataPartitionTable.get(database) == null || dataPartitionTable.get(database).isEmpty()) { + if (localDataPartitionTable == null || localDataPartitionTable.isEmpty() || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { LOG.error("No data partition table related to database {} was found from the ConfigNode", database); continue; } - Map>> seriesPartitionMap = dataPartitionTable.get(database); + Map>> seriesPartitionMap = localDataPartitionTable.get(database); for (Map.Entry>> seriesPartitionEntry : seriesPartitionMap.entrySet()) { Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); @@ -292,7 +291,9 @@ private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { } } - if (lostDataPartitionsOfDatabases.isEmpty()) { + //@TODO simulate case that lost data partition +// if (lostDataPartitionsOfDatabases.isEmpty()) { + if (!lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -305,7 +306,7 @@ private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - private Map>>> getLocalDataPartitionTable(ConfigNodeProcedureEnv env, String database) { + private Map>>> getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) .getSchemaPartitionTable(); @@ -330,7 +331,7 @@ private Map finalDataPartitionMap = new HashMap<>(); + + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - // TODO: Implement proper merging logic - // For now, use the first DataPartitionTable as the final one - if (!dataPartitionTables.isEmpty()) { - DataPartitionTable firstTable = dataPartitionTables.values().iterator().next(); - finalDataPartitionTable = firstTable; - - // In a real implementation, you would: - // 1. Merge all series partition slots from all DataNodes - // 2. For each series slot, merge time slot information - // 3. Resolve conflicts by choosing the most recent/complete data - // 4. Ensure consistency across all DataNodes - - LOG.info( - "Merged DataPartitionTable contains {} series partitions", - finalDataPartitionTable.getDataPartitionMap().size()); + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; + } + + localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); } + finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables.values().forEach(dataPartitionTable -> { + if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot,dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition(k, tConsensusGroupId); + })); + }); + }); + }); + + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); return Flow.HAS_MORE_STATE; @@ -478,9 +507,17 @@ private Flow mergePartitionTables(ConfigNodeProcedureEnv env) { } /** Write the final DataPartitionTable to raft log. */ - private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { + private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { LOG.info("Writing DataPartitionTable to raft log..."); + if (lostDataPartitionsOfDatabases.isEmpty()) { + LOG.error("No database lost data partition table"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No database lost data partition table for raft write")); + return Flow.NO_MORE_STATE; + } + if (finalDataPartitionTable == null) { LOG.error("No DataPartitionTable to write to raft"); setFailure( @@ -489,39 +526,35 @@ private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { return Flow.NO_MORE_STATE; } - try { - // TODO: Implement actual raft log write - // This should create a consensus request to write the DataPartitionTable - // Example: - // WriteDataPartitionTablePlan plan = new - // WriteDataPartitionTablePlan(finalDataPartitionTable); - // env.getConfigManager().getConsensusManager().write(plan); - - // For now, simulate successful write - boolean writeSuccess = true; - - if (writeSuccess) { - LOG.info("DataPartitionTable successfully written to raft log"); - setNextState(DataPartitionTableIntegrityCheckProcedureState.SUCCESS); - return Flow.HAS_MORE_STATE; - } else { - LOG.error("Failed to write DataPartitionTable to raft log"); - setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("Failed to write DataPartitionTable to raft log")); - return Flow.NO_MORE_STATE; + int failedCnt = 0; + while (failedCnt < MAX_RETRY_COUNT) { + try { + CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); + Map assignedDataPartition = new HashMap<>(); + assignedDataPartition.put(lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); + createPlan.setAssignedDataPartition(assignedDataPartition); + TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); + + if (tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOG.info("DataPartitionTable successfully written to raft log"); + break; + } else { + LOG.error("Failed to write DataPartitionTable to raft log"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); + } + } catch (Exception e) { + LOG.error("Error writing DataPartitionTable to raft log", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); } - - } catch (Exception e) { - LOG.error("Error writing DataPartitionTable to raft log", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - return Flow.NO_MORE_STATE; + failedCnt++; } + return Flow.NO_MORE_STATE; } - // @TODO @Override - public void serialize(DataOutputStream stream) throws IOException { + public void serialize(final DataOutputStream stream) throws IOException { super.serialize(stream); // Serialize earliestTimeslots @@ -533,21 +566,45 @@ public void serialize(DataOutputStream stream) throws IOException { // Serialize dataPartitionTables count stream.writeInt(dataPartitionTables.size()); - // Note: DataPartitionTable serialization would need to be implemented here + for (Map.Entry entry : dataPartitionTables.entrySet()) { + stream.writeInt(entry.getKey()); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + entry.getValue().serialize(oos, protocol); + } catch (IOException | TException e) { + LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + } + } + + stream.writeInt(lostDataPartitionsOfDatabases.size()); + for (String database : lostDataPartitionsOfDatabases) { + stream.writeUTF(database); + } + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + finalDataPartitionTable.serialize(oos, protocol); + } catch (IOException | TException e) { + LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + } - // Serialize targetDataNodes count - stream.writeInt(targetDataNodes.size()); - for (TDataNodeConfiguration dataNode : targetDataNodes) { - stream.writeInt(dataNode.getLocation().getDataNodeId()); + stream.writeInt(skipDnIds.size()); + for (int skipDnId : skipDnIds) { + stream.writeInt(skipDnId); } - // Serialize retryCount - stream.writeInt(retryCount); + stream.writeInt(failedDnIds.size()); + for (int failedDnId : failedDnIds) { + stream.writeInt(failedDnId); + } } - // @TODO @Override - public void deserialize(ByteBuffer byteBuffer) { + public void deserialize(final ByteBuffer byteBuffer) { super.deserialize(byteBuffer); // Deserialize earliestTimeslots @@ -561,18 +618,57 @@ public void deserialize(ByteBuffer byteBuffer) { // Deserialize dataPartitionTables count int dataPartitionTablesSize = byteBuffer.getInt(); - dataPartitionTables = new ConcurrentHashMap<>(); - // Note: DataPartitionTable deserialization would need to be implemented here + for (int i = 0; i < dataPartitionTablesSize; i++) { + int key = byteBuffer.getInt(); + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + DataPartitionTable value = new DataPartitionTable(); + value.deserialize(ois, protocol); + dataPartitionTables.put(key, value); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + } - // Deserialize targetDataNodes - int targetDataNodesSize = byteBuffer.getInt(); - targetDataNodes = new ArrayList<>(); - for (int i = 0; i < targetDataNodesSize; i++) { - int dataNodeId = byteBuffer.getInt(); - // Note: TDataNodeLocation reconstruction would need to be implemented here + int lostDataPartitionsOfDatabasesSize = byteBuffer.getInt(); + for (int i = 0; i < lostDataPartitionsOfDatabasesSize; i++) { + String database = ReadWriteIOUtils.readString(byteBuffer); + lostDataPartitionsOfDatabases.add(database); } - // Deserialize retryCount - retryCount = byteBuffer.getInt(); + // Deserialize finalDataPartitionTable size + int finalDataPartitionTableSize = byteBuffer.getInt(); + byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; + byteBuffer.get(finalDataPartitionTableBytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + finalDataPartitionTable = new DataPartitionTable(); + finalDataPartitionTable.deserialize(ois, protocol); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + + int skipDnIdsSize = byteBuffer.getInt(); + for (int i = 0; i < skipDnIdsSize; i++) { + skipDnIds.add(byteBuffer.getInt()); + } + + int failedDnIdsSize = byteBuffer.getInt(); + for (int i = 0; i < failedDnIdsSize; i++) { + failedDnIds.add(byteBuffer.getInt()); + } } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 3befc7f1634f1..9e4836a089cdf 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -24,6 +24,8 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.ServerCommandLine; import org.apache.iotdb.commons.client.ClientManagerMetrics; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; import org.apache.iotdb.commons.concurrent.ThreadModule; import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.ThreadPoolMetrics; @@ -78,6 +80,10 @@ import java.util.Arrays; import java.util.List; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { @@ -109,6 +115,14 @@ public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { private int exitStatusCode = 0; + private Future dataPartitionTableCheckFuture; + + private ExecutorService dataPartitionTableCheckExecutor = + IoTDBThreadPoolFactory.newSingleThreadExecutor( + "DATA_PARTITION_TABLE_CHECK"); + + private final CountDownLatch latch = new CountDownLatch(1); + public ConfigNode() { super("ConfigNode"); // We do not init anything here, so that we can re-initialize the instance in IT. @@ -146,6 +160,11 @@ protected void start() throws IoTDBException { } active(); LOGGER.info("IoTDB started"); + try { + dataPartitionTableCheckFuture.get(); + } catch (ExecutionException | InterruptedException e) { + LOGGER.error("Data partition table check task execute failed", e); + } } @Override @@ -203,13 +222,29 @@ public void active() { loadSecretKey(); loadHardwareCode(); - // The data partition table integrity check is only performed when the ConfigNode is the leader node - if (configManager.getConsensusManager().isLeader()) { - TSStatus status = configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error("Data partition table integrity check failed!"); + dataPartitionTableCheckFuture = dataPartitionTableCheckExecutor.submit(() -> { + LOGGER.info("Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); + + while (latch.getCount() > 0) { + List dnList = configManager + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus.Running); + if (dnList != null && !dnList.isEmpty()) { + LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); + TSStatus status = + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + latch.countDown(); + } else { + LOGGER.info("No running datanodes found, waiting..."); + Thread.sleep(5000); // 等待5秒后重新检查 + } } - } + return null; + }); return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 689a12bd8df89..56bc17d808b16 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -20,6 +20,7 @@ package org.apache.iotdb.db.partition; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.partition.DataPartitionTable; @@ -37,6 +38,7 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -68,6 +70,7 @@ public class DataPartitionTableGenerator { // Configuration private final String[] dataDirectories; private final ExecutorService executor; + private final Set databases; private final int seriesSlotNum; private final String seriesPartitionExecutorClass; @@ -76,15 +79,20 @@ public class DataPartitionTableGenerator { private static final LeakyBucketRateLimiter limiter = new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); - private static final String SCAN_FILE_SUFFIX_NAME = ".resource"; + public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; + public static final Set IGNORE_DATABASE = new HashSet() {{ + add("root.__audit"); + }}; public DataPartitionTableGenerator( String dataDirectory, ExecutorService executor, + Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { this.dataDirectories = new String[]{dataDirectory}; this.executor = executor; + this.databases = databases; this.seriesSlotNum = seriesSlotNum; this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } @@ -92,10 +100,12 @@ public DataPartitionTableGenerator( public DataPartitionTableGenerator( String[] dataDirectories, ExecutorService executor, + Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { this.dataDirectories = dataDirectories; this.executor = executor; + this.databases = databases; this.seriesSlotNum = seriesSlotNum; this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } @@ -147,28 +157,42 @@ private void generateDataPartitionTable() throws IOException { // First layer: database directories Files.list(Paths.get(dataDirectory)) .filter(Files::isDirectory) - .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - LOG.debug("Processing database: {}", databaseName); - - try { - Files.list(dbPath) + .forEach(sequenceTypePath -> { + try { + Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach( - regionPath -> { - processRegionDirectory( - regionPath, - databaseName, - dataPartitionMap, - executor, - futures); - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", dbPath, e); - failedFiles.incrementAndGet(); - } - }); + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Processing database: {}", databaseName); + } + + try { + Files.list(dbPath) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", sequenceTypePath, e); + failedFiles.incrementAndGet(); + } + }); } // Wait for all tasks to complete @@ -213,6 +237,7 @@ private void processRegionDirectory( TConsensusGroupId consensusGroupId = new TConsensusGroupId(); consensusGroupId.setId(regionId); + consensusGroupId.setType(TConsensusGroupType.DataRegion); // Process time partitions asynchronously CompletableFuture regionFuture = @@ -252,7 +277,7 @@ private void processTimeSlotDirectory( } try { - // Fourth layer: .tsfile files + // Fourth layer: .resource files Files.walk(timeSlotPath) .filter(Files::isRegularFile) .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) @@ -313,10 +338,31 @@ private void countTotalFiles() throws IOException { AtomicLong fileCount = new AtomicLong(0); for (String dataDirectory : dataDirectories) { - Files.walk(Paths.get(dataDirectory)) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach(p -> fileCount.incrementAndGet()); + Files.list(Paths.get(dataDirectory)) + .filter(Files::isDirectory) + .forEach(sequenceTypePath -> { + try { + Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + try { + Files.walk(dbPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", dbPath, e); + } + }); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); + } + }); } totalFiles.set(fileCount.get()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index cf2178c3569c1..222025214532e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -324,13 +324,14 @@ import org.apache.iotdb.trigger.api.enums.TriggerEvent; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; -import org.apache.thrift.protocol.TProtocol; import org.apache.thrift.transport.TIOStreamTransport; +import org.apache.thrift.transport.TTransport; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; import org.apache.tsfile.read.common.TimeRange; import org.apache.tsfile.read.common.block.TsBlock; import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.utils.PublicBAOS; import org.apache.tsfile.utils.RamUsageEstimator; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.apache.tsfile.write.record.Tablet; @@ -345,6 +346,7 @@ import java.net.URL; import java.nio.ByteBuffer; import java.nio.file.Files; +import java.nio.file.Path; import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; @@ -3197,6 +3199,11 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { @Override public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + return generateDataPartitionTable(req, dataDirs); + } + + public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req, String[] dataDirs) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3212,7 +3219,6 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP } // Get data directories and configuration - String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); if (dataDirs.length == 0) { resp.setDataPartitionTable(empty); resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); @@ -3228,7 +3234,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP currentGenerator = new DataPartitionTableGenerator( - dataDirs, partitionTableRecoverExecutor, seriesSlotNum, seriesPartitionExecutorClass); + dataDirs, partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately @@ -3237,7 +3243,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP // Wait for completion (with timeout) long startTime = System.currentTimeMillis(); - while (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + while (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { if (System.currentTimeMillis() - startTime > timeoutMs) { resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); resp.setMessage("DataPartitionTable generation timed out"); @@ -3261,8 +3267,8 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); if (dataPartitionTable != null) { - ByteBuffer result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result.array()); + byte[] result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result); } resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); @@ -3350,12 +3356,23 @@ private void processDataDirectoryForEarliestTimeslots( Files.list(dataDir.toPath()) .filter(Files::isDirectory) .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + sequenceTypePath -> { + try { + Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { + return; + } + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); - if (earliestTimeslot != Long.MAX_VALUE) { - earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to process data directory: {}", sequenceTypePath.toFile(), e); } }); } catch (IOException e) { @@ -3368,7 +3385,7 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); try { - Files.walk(databaseDir.toPath()) + Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( regionPath -> { @@ -3377,10 +3394,18 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { Files.list(regionPath) .filter(Files::isDirectory) .forEach(timeSlotPath -> { - String timeSlotName = timeSlotPath.getFileName().toString(); - long timeslot = Long.parseLong(timeSlotName); - if (timeslot < earliest.get()) { - earliest.set(timeslot); + try { + Optional matchedFile = Files.find(timeSlotPath, 1, (path, attrs) -> attrs.isRegularFile() && path.toString().endsWith(DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME)).findFirst(); + if (!matchedFile.isPresent()) { + return; + } + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot < earliest.get()) { + earliest.set(timeslot); + } + } catch (IOException e) { + LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); } }); } catch (IOException e) { @@ -3396,15 +3421,19 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { } /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ - private ByteBuffer serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - TIOStreamTransport tioStreamTransport = new TIOStreamTransport(baos)) { - TProtocol protocol = new TBinaryProtocol(tioStreamTransport); - dataPartitionTable.serialize(baos, protocol); - return ByteBuffer.wrap(baos.toByteArray()); - } catch (Exception e) { + private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { +// try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); +// ObjectOutputStream oos = new ObjectOutputStream(baos)) { + + try (PublicBAOS baos = new PublicBAOS(); + DataOutputStream oos = new DataOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + dataPartitionTable.serialize(oos, protocol); + return baos.getBuf(); + } catch (IOException | TException e) { LOGGER.error("Failed to serialize DataPartitionTable", e); - return ByteBuffer.allocate(0); + return ByteBuffer.allocate(0).array(); } } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java index d625b753e193b..f4a950a72afd6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java @@ -679,7 +679,7 @@ public Set getDevices() { } public Set getDevices(LeakyBucketRateLimiter limiter) { - return timeIndex.getDevicesByRateLimiter(file.getPath(), this, limiter); + return timeIndex.getDevices(file.getPath(), this, limiter); } public ArrayDeviceTimeIndex buildDeviceTimeIndex(IDeviceID.Deserializer deserializer) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java index 8499b6d6b3d3e..71a761a813731 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; @@ -171,6 +172,11 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc return deviceToIndex.keySet(); } + @Override + public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + return deviceToIndex.keySet(); + } + public Map getDeviceToIndex() { return deviceToIndex; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index 6cbcc48021f77..a0a725c85d73d 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -123,7 +123,6 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc @Override public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { - byte[] buffer = new byte[64 * 1024]; tsFileResource.readLock(); try (InputStream inputStream = FSFactoryProducer.getFSFactory() diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index 7b3f047b34d92..5f94703a944ba 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -75,11 +75,11 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali Set getDevices(String tsFilePath, TsFileResource tsFileResource); /** - * get devices in TimeIndex that use inputStream + * get devices in TimeIndex and limit files reading rate * * @return device names */ - Set getDevicesByRateLimiter(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); /** * @return whether end time is empty (Long.MIN_VALUE) diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index 622c2c4ebbfe7..fc2c05b75b799 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.consensus.DataRegionId; import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.consensus.ConsensusFactory; @@ -53,6 +54,9 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.db.utils.EnvironmentUtils; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TPlanNode; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; @@ -68,6 +72,8 @@ import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -75,12 +81,16 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; public class DataNodeInternalRPCServiceImplTest { + private static final Logger LOG = + LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); private static final IoTDBConfig conf = IoTDBDescriptor.getInstance().getConfig(); DataNodeInternalRPCServiceImpl dataNodeInternalRPCServiceImpl; private static IConsensus instance; @@ -412,4 +422,44 @@ private List genSchemaRegionPeerList(TRegionReplicaSet regionReplicaSet) { } return peerList; } + + @Test + public void testGetEarliestTimeslots() { + Set lostDataPartitionsOfDatabases = new HashSet<>(); + lostDataPartitionsOfDatabases.add("root.demo"); + + TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); + req.setDatabases(lostDataPartitionsOfDatabases); + + // Use consensus layer to execute request + TGetEarliestTimeslotsResp response = + dataNodeInternalRPCServiceImpl.getEarliestTimeslots(); + + Map result = new HashMap(){{ + put("test", 2927L); + put("root.test", 0L); + put("root.demo", 0L); + }}; + Assert.assertNotSame(response.getDatabaseToEarliestTimeslot(), result); + } + + @Test + public void testGenerateDataPartitionTable() { + Set lostDataPartitionsOfDatabases = new HashSet<>(); + lostDataPartitionsOfDatabases.add("root.demo"); + + TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); + req.setDatabases(lostDataPartitionsOfDatabases); + + // Use consensus layer to execute request + String[] dataDirs = new String[]{"D:\\Users\\libo\\Downloads\\muliti-iotdb\\master-iotdb-source-conf\\data\\datanode\\data"}; + TGenerateDataPartitionTableResp response = + dataNodeInternalRPCServiceImpl.generateDataPartitionTable(req, dataDirs); + + Assert.assertNotSame(response.getDataPartitionTable(), ByteBuffer.allocate(0).array()); + + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(ByteBuffer.wrap(response.getDataPartitionTable())); + Assert.assertEquals(1, dataPartitionTable.getTimeSlotCount()); + } } diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index f90f664572553..ff9066c0dec22 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,6 +757,12 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 +# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# effectiveMode: restart +# Datatype: Integer +#partition_table_recover_wait_all_dn_up_timeout=300000 +partition_table_recover_wait_all_dn_up_timeout=2000 + #################### ### Memory Control Configuration #################### diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java index b700dbbb6b033..7901f9cc36a1d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java @@ -18,8 +18,6 @@ */ package org.apache.iotdb.commons; -import org.apache.iotdb.commons.exception.IoTDBException; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; @@ -28,6 +26,7 @@ import org.apache.commons.cli.OptionGroup; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import org.apache.iotdb.commons.exception.IoTDBException; import java.io.PrintWriter; import java.util.HashSet; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index c5dd3e401d13e..d1a550c5ca1c9 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -77,12 +77,6 @@ public static TTimePartitionSlot getTimePartitionSlot(long time) { return timePartitionSlot; } - public static TTimePartitionSlot getTimePartitionSlot(long partitionId) { - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(); - timePartitionSlot.setStartTime(getTimePartitionLowerBound(time)); - return timePartitionSlot; - } - public static long getTimePartitionInterval() { return timePartitionInterval; } diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 54479b2859875..b248599f59cc4 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -688,7 +688,7 @@ struct TGetEarliestTimeslotsResp { } struct TGenerateDataPartitionTableReq { - 1: required string database + 1: required set databases } struct TGenerateDataPartitionTableResp { From cb5f6014e6a88587cb34c07cf841d731205ea8d4 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 10 Mar 2026 10:53:13 +0800 Subject: [PATCH 03/13] Correct dataPartitionTables and finalDataPartitionTable serialization; Adjust method that record the earliest timeslot id for every database --- ...PartitionTableIntegrityCheckProcedure.java | 73 ++++++++++++------- .../impl/DataNodeInternalRPCServiceImpl.java | 14 ++-- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 5c06b29418f65..76c575a40609d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -206,10 +206,6 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { } Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); -// Map nodeTimeslots = new HashMap<>(); -// nodeTimeslots.put("test", 2927L); -// nodeTimeslots.put("root.test", 0L); -// nodeTimeslots.put("root.demo", 0L); // Merge with existing timeslots (take minimum) for (Map.Entry entry : nodeTimeslots.entrySet()) { @@ -388,7 +384,9 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { * Check completion status of DataPartitionTable generation tasks. */ private void checkPartitionTableGenerationStatus() { - LOG.info("Checking DataPartitionTable generation completion status..."); + if (LOG.isDebugEnabled()) { + LOG.info("Checking DataPartitionTable generation completion status..."); + } int completeCount = 0; for (TDataNodeConfiguration dataNode : allDataNodes) { @@ -508,7 +506,9 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { /** Write the final DataPartitionTable to raft log. */ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { - LOG.info("Writing DataPartitionTable to raft log..."); + if (LOG.isDebugEnabled()) { + LOG.info("Writing DataPartitionTable to raft log..."); + } if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.error("No database lost data partition table"); @@ -573,8 +573,14 @@ public void serialize(final DataOutputStream stream) throws IOException { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); entry.getValue().serialize(oos, protocol); + + // Write the size and data for byte array after serialize + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); } catch (IOException | TException e) { LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + throw new IOException("Failed to serialize dataPartitionTables", e); } } @@ -583,13 +589,23 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeUTF(database); } - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - finalDataPartitionTable.serialize(oos, protocol); - } catch (IOException | TException e) { - LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + if (finalDataPartitionTable != null) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + finalDataPartitionTable.serialize(oos, protocol); + + // Write the size and data for byte array after serialize + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (IOException | TException e) { + LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + throw new IOException("Failed to serialize finalDataPartitionTable", e); + } + } else { + stream.writeInt(0); } stream.writeInt(skipDnIds.size()); @@ -618,6 +634,7 @@ public void deserialize(final ByteBuffer byteBuffer) { // Deserialize dataPartitionTables count int dataPartitionTablesSize = byteBuffer.getInt(); + dataPartitionTables = new HashMap<>(); for (int i = 0; i < dataPartitionTablesSize; i++) { int key = byteBuffer.getInt(); int size = byteBuffer.getInt(); @@ -646,19 +663,23 @@ public void deserialize(final ByteBuffer byteBuffer) { // Deserialize finalDataPartitionTable size int finalDataPartitionTableSize = byteBuffer.getInt(); - byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; - byteBuffer.get(finalDataPartitionTableBytes); - try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { - TTransport transport = new TIOStreamTransport(ois); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - - // Deserialize by input stream and protocol - finalDataPartitionTable = new DataPartitionTable(); - finalDataPartitionTable.deserialize(ois, protocol); - } catch (IOException | TException e) { - LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); - throw new RuntimeException(e); + if (finalDataPartitionTableSize > 0) { + byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; + byteBuffer.get(finalDataPartitionTableBytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + finalDataPartitionTable = new DataPartitionTable(); + finalDataPartitionTable.deserialize(ois, protocol); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + } else { + finalDataPartitionTable = null; } int skipDnIdsSize = byteBuffer.getInt(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 222025214532e..2b033c2498e4d 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -450,6 +450,8 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); + private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); + private static final long timeoutMs = 600000; // 600 seconds timeout public DataNodeInternalRPCServiceImpl() { @@ -3365,6 +3367,7 @@ private void processDataDirectoryForEarliestTimeslots( if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { return; } + databaseEarliestRegionMap.computeIfAbsent(databaseName, key -> Long.MAX_VALUE); long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); if (earliestTimeslot != Long.MAX_VALUE) { @@ -3382,7 +3385,7 @@ private void processDataDirectoryForEarliestTimeslots( /** Find the earliest timeslot in a database directory. */ private long findEarliestTimeslotInDatabase(File databaseDir) { - final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); + String databaseName = databaseDir.getName(); try { Files.list(databaseDir.toPath()) @@ -3401,8 +3404,8 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { } String timeSlotName = timeSlotPath.getFileName().toString(); long timeslot = Long.parseLong(timeSlotName); - if (timeslot < earliest.get()) { - earliest.set(timeslot); + if (timeslot < databaseEarliestRegionMap.get(databaseName)) { + databaseEarliestRegionMap.put(databaseName, timeslot); } } catch (IOException e) { LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); @@ -3417,14 +3420,11 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); } - return earliest.get(); + return databaseEarliestRegionMap.get(databaseName); } /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { -// try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); -// ObjectOutputStream oos = new ObjectOutputStream(baos)) { - try (PublicBAOS baos = new PublicBAOS(); DataOutputStream oos = new DataOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); From f19d1d381efb2e86ccc801665c10b7e858151d71 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 10 Mar 2026 11:47:31 +0800 Subject: [PATCH 04/13] Correct heartbeat logic when data partition table is generating; Remove two unit testes only run successful in local environment --- ...PartitionTableIntegrityCheckProcedure.java | 15 ++++--- .../impl/DataNodeInternalRPCServiceImpl.java | 3 -- .../DataNodeInternalRPCServiceImplTest.java | 41 ------------------- 3 files changed, 10 insertions(+), 49 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 76c575a40609d..52fe407be661d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -288,8 +288,8 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } //@TODO simulate case that lost data partition -// if (lostDataPartitionsOfDatabases.isEmpty()) { - if (!lostDataPartitionsOfDatabases.isEmpty()) { + lostDataPartitionsOfDatabases.add("root.demo"); + if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -345,7 +345,6 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { if (!dataPartitionTables.containsKey(dataNodeId)) { try { TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - lostDataPartitionsOfDatabases.add("root.demo"); req.setDatabases(lostDataPartitionsOfDatabases); TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), req, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); @@ -396,8 +395,12 @@ private void checkPartitionTableGenerationStatus() { try { TGenerateDataPartitionTableHeartbeatResp resp = (TGenerateDataPartitionTableHeartbeatResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); - DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getStatus().getCode()); + DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } switch (state) { case SUCCESS: LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); @@ -411,7 +414,7 @@ private void checkPartitionTableGenerationStatus() { completeCount++; break; default: - LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getStatus().getCode()); + LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { @@ -422,6 +425,8 @@ private void checkPartitionTableGenerationStatus() { e); completeCount++; } + } else { + completeCount++; } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 2b033c2498e4d..d108180029b46 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3285,9 +3285,6 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); } - - // Clear current generator - currentGenerator = null; } catch (Exception e) { LOGGER.error("Failed to generate DataPartitionTable", e); resp.setStatus( diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index fc2c05b75b799..adf276dd8e2a8 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -61,7 +61,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeReq; - import org.apache.ratis.util.FileUtils; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.file.metadata.enums.CompressionType; @@ -422,44 +421,4 @@ private List genSchemaRegionPeerList(TRegionReplicaSet regionReplicaSet) { } return peerList; } - - @Test - public void testGetEarliestTimeslots() { - Set lostDataPartitionsOfDatabases = new HashSet<>(); - lostDataPartitionsOfDatabases.add("root.demo"); - - TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - req.setDatabases(lostDataPartitionsOfDatabases); - - // Use consensus layer to execute request - TGetEarliestTimeslotsResp response = - dataNodeInternalRPCServiceImpl.getEarliestTimeslots(); - - Map result = new HashMap(){{ - put("test", 2927L); - put("root.test", 0L); - put("root.demo", 0L); - }}; - Assert.assertNotSame(response.getDatabaseToEarliestTimeslot(), result); - } - - @Test - public void testGenerateDataPartitionTable() { - Set lostDataPartitionsOfDatabases = new HashSet<>(); - lostDataPartitionsOfDatabases.add("root.demo"); - - TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - req.setDatabases(lostDataPartitionsOfDatabases); - - // Use consensus layer to execute request - String[] dataDirs = new String[]{"D:\\Users\\libo\\Downloads\\muliti-iotdb\\master-iotdb-source-conf\\data\\datanode\\data"}; - TGenerateDataPartitionTableResp response = - dataNodeInternalRPCServiceImpl.generateDataPartitionTable(req, dataDirs); - - Assert.assertNotSame(response.getDataPartitionTable(), ByteBuffer.allocate(0).array()); - - DataPartitionTable dataPartitionTable = new DataPartitionTable(); - dataPartitionTable.deserialize(ByteBuffer.wrap(response.getDataPartitionTable())); - Assert.assertEquals(1, dataPartitionTable.getTimeSlotCount()); - } } From 49ef823303dfe418d4454c6a21e567e209f8b951 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:52:13 +0800 Subject: [PATCH 05/13] Use StorageEngine.getInstance().getAllDataRegions() to get Data Partition Information instead of scanning data directories in the DataNode; Correct the logic that retry after the step failed; Correct skipDataNodes and failedDataNodes serialization and deserialization. --- ...PartitionTableIntegrityCheckProcedure.java | 228 ++++++++++++------ .../DataPartitionTableGenerator.java | 96 ++++++-- .../impl/DataNodeInternalRPCServiceImpl.java | 100 ++++---- 3 files changed, 273 insertions(+), 151 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 52fe407be661d..dbb47019e136d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -69,7 +69,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; /** * Procedure for checking and restoring data partition table integrity. This procedure scans all @@ -101,8 +100,8 @@ public class DataPartitionTableIntegrityCheckProcedure /** Final merged DataPartitionTable */ private DataPartitionTable finalDataPartitionTable; - private static Set skipDnIds = new HashSet<>(); - private static Set failedDnIds = new HashSet<>(); + private static Set skipDataNodes = new HashSet<>(); + private static Set failedDataNodes = new HashSet<>(); private static ScheduledExecutorService heartBeatExecutor; //============Need serialize END=============/ @@ -121,8 +120,8 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPart switch (state) { case COLLECT_EARLIEST_TIMESLOTS: - failedDnIds = new HashSet<>(); - return collectEarliestTimeslots(env); + failedDataNodes = new HashSet<>(); + return collectEarliestTimeslots(); case ANALYZE_MISSING_PARTITIONS: lostDataPartitionsOfDatabases = new HashSet<>(); return analyzeMissingPartitions(env); @@ -175,7 +174,8 @@ protected int getStateId(final DataPartitionTableIntegrityCheckProcedureState st @Override protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { - skipDnIds = new HashSet<>(); + skipDataNodes = new HashSet<>(); + failedDataNodes = new HashSet<>(); return DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS; } @@ -183,7 +183,7 @@ protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. */ - private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { + private Flow collectEarliestTimeslots() { if (LOG.isDebugEnabled()) { LOG.debug("Collecting earliest timeslots from all DataNodes..."); } @@ -195,12 +195,13 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { } // Collect earliest timeslots from all DataNodes + allDataNodes.removeAll(skipDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { try { TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); continue; } @@ -224,7 +225,7 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { dataNode.getLocation().getDataNodeId(), e.getMessage(), e); - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); } } @@ -233,11 +234,10 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", allDataNodes.size(), earliestTimeslots, - allDataNodes.size() - failedDnIds.size()); + allDataNodes.size() - failedDataNodes.size()); } - Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); - if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); } else { setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); @@ -287,8 +287,6 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } } - //@TODO simulate case that lost data partition - lostDataPartitionsOfDatabases.add("root.demo"); if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; @@ -297,7 +295,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { LOG.info( "Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} DataNodes", lostDataPartitionsOfDatabases.size(), - allDataNodes.size() - failedDnIds.size()); + allDataNodes.size() - failedDataNodes.size()); setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); return Flow.HAS_MORE_STATE; } @@ -340,6 +338,8 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { heartBeatExecutor.scheduleAtFixedRate(this::checkPartitionTableGenerationStatus, 0, HEART_BEAT_REQUEST_RATE, TimeUnit.MILLISECONDS); + allDataNodes.removeAll(skipDataNodes); + allDataNodes.removeAll(failedDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { int dataNodeId = dataNode.getLocation().getDataNodeId(); if (!dataPartitionTables.containsKey(dataNodeId)) { @@ -349,7 +349,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), req, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); LOG.error("Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); continue; } @@ -359,7 +359,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); dataPartitionTables.put(dataNodeId, dataPartitionTable); } catch (Exception e) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); LOG.error( "Failed to request DataPartitionTable generation from DataNode[id={}]: {}", dataNodeId, @@ -369,8 +369,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { } } - Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); - if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -398,7 +397,7 @@ private void checkPartitionTableGenerationStatus() { DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", dataNode.getLocation().getDataNodeId(), state, resp.getStatus()); continue; } switch (state) { @@ -449,64 +448,70 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - try { - Map finalDataPartitionMap = new HashMap<>(); + int failedCnt = 0; + while (failedCnt < MAX_RETRY_COUNT) { + try { + Map finalDataPartitionMap = new HashMap<>(); - for (String database : lostDataPartitionsOfDatabases) { - // Get current DataPartitionTable from ConfigManager - Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); - continue; - } - - localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { - return; + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; } - finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); - } - finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables.values().forEach(dataPartitionTable -> { - if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot,dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { return; } + finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); + } - if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables.values().forEach(dataPartitionTable -> { + if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; } - - // dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { - if (seriesPartitionTable == null) { + dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot, dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { return; } - seriesPartitionTable.putDataPartition(k, tConsensusGroupId); - })); - }); - }); - }); - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + } - LOG.info("DataPartitionTable merge completed successfully"); - setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); - return Flow.HAS_MORE_STATE; + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition(k, tConsensusGroupId); + })); + }); + }); + }); - } catch (Exception e) { - LOG.error("Failed to merge DataPartitionTables", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - return Flow.NO_MORE_STATE; + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + break; + } catch (Exception e) { + LOG.error("Failed to merge DataPartitionTables", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + failedCnt++; + if (failedCnt >= MAX_RETRY_COUNT) { + return Flow.NO_MORE_STATE; + } + } } + + LOG.info("DataPartitionTable merge completed successfully"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; } /** Write the final DataPartitionTable to raft log. */ @@ -520,7 +525,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No database lost data partition table for raft write")); - return Flow.NO_MORE_STATE; + return getFlow(); } if (finalDataPartitionTable == null) { @@ -528,7 +533,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No DataPartitionTable available for raft write")); - return Flow.NO_MORE_STATE; + return getFlow(); } int failedCnt = 0; @@ -555,7 +560,20 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { } failedCnt++; } - return Flow.NO_MORE_STATE; + + return getFlow(); + } + + private Flow getFlow() { + if (!failedDataNodes.isEmpty()) { + allDataNodes.removeAll(failedDataNodes); + skipDataNodes = new HashSet<>(allDataNodes); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } else { + skipDataNodes.clear(); + return Flow.NO_MORE_STATE; + } } @Override @@ -613,14 +631,36 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(0); } - stream.writeInt(skipDnIds.size()); - for (int skipDnId : skipDnIds) { - stream.writeInt(skipDnId); + stream.writeInt(skipDataNodes.size()); + for (TDataNodeConfiguration skipDataNode : skipDataNodes) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + TTransport transport = new TIOStreamTransport(baos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + skipDataNode.write(protocol); + + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (TException e) { + LOG.error("Failed to serialize skipDataNode", e); + throw new IOException("Failed to serialize skipDataNode", e); + } } - stream.writeInt(failedDnIds.size()); - for (int failedDnId : failedDnIds) { - stream.writeInt(failedDnId); + stream.writeInt(failedDataNodes.size()); + for (TDataNodeConfiguration failedDataNode : failedDataNodes) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + TTransport transport = new TIOStreamTransport(baos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + failedDataNode.write(protocol); + + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (TException e) { + LOG.error("Failed to serialize failedDataNode", e); + throw new IOException("Failed to serialize failedDataNode", e); + } } } @@ -687,14 +727,44 @@ public void deserialize(final ByteBuffer byteBuffer) { finalDataPartitionTable = null; } - int skipDnIdsSize = byteBuffer.getInt(); - for (int i = 0; i < skipDnIdsSize; i++) { - skipDnIds.add(byteBuffer.getInt()); + skipDataNodes = new HashSet<>(); + int skipDataNodesSize = byteBuffer.getInt(); + for (int i = 0; i < skipDataNodesSize; i++) { + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes)) { + TTransport transport = new TIOStreamTransport(bais); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + TDataNodeConfiguration dataNode = new TDataNodeConfiguration(); + dataNode.read(protocol); + skipDataNodes.add(dataNode); + } catch (TException | IOException e) { + LOG.error("Failed to deserialize skipDataNode", e); + throw new RuntimeException(e); + } } - int failedDnIdsSize = byteBuffer.getInt(); - for (int i = 0; i < failedDnIdsSize; i++) { - failedDnIds.add(byteBuffer.getInt()); + failedDataNodes = new HashSet<>(); + int failedDataNodesSize = byteBuffer.getInt(); + for (int i = 0; i < failedDataNodesSize; i++) { + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes)) { + TTransport transport = new TIOStreamTransport(bais); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + TDataNodeConfiguration dataNode = new TDataNodeConfiguration(); + dataNode.read(protocol); + failedDataNodes.add(dataNode); + } catch (TException | IOException e) { + LOG.error("Failed to deserialize failedDataNode", e); + throw new RuntimeException(e); + } } } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 56bc17d808b16..4ce321e9a536c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -29,7 +29,11 @@ import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +72,7 @@ public class DataPartitionTableGenerator { private final AtomicLong totalFiles = new AtomicLong(0); // Configuration - private final String[] dataDirectories; + private String[] dataDirectories; private final ExecutorService executor; private final Set databases; private final int seriesSlotNum; @@ -84,6 +88,17 @@ public class DataPartitionTableGenerator { add("root.__audit"); }}; + public DataPartitionTableGenerator( + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.executor = executor; + this.databases = databases; + this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + public DataPartitionTableGenerator( String dataDirectory, ExecutorService executor, @@ -117,25 +132,78 @@ public enum TaskStatus { FAILED } - /** Start generating DataPartitionTable asynchronously. */ - public void startGeneration() { + /** + * Start generating DataPartitionTable asynchronously. + * + */ + public CompletableFuture startGeneration() { if (status != TaskStatus.NOT_STARTED) { throw new IllegalStateException("Task is already started or completed"); } status = TaskStatus.IN_PROGRESS; + return CompletableFuture.runAsync(this::generateDataPartitionTableByMemory); + } + + private void generateDataPartitionTableByMemory() { + Map dataPartitionMap = new ConcurrentHashMap<>(); + List> futures = new ArrayList<>(); + + SeriesPartitionExecutor seriesPartitionExecutor = + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); + + for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } - CompletableFuture.runAsync( - () -> { - try { - generateDataPartitionTable(); - status = TaskStatus.COMPLETED; - } catch (Exception e) { - LOG.error("Failed to generate DataPartitionTable", e); - errorMessage = e.getMessage(); - status = TaskStatus.FAILED; - } - }); + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); + constructDataPartitionMap(unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + }, + executor); + futures.add(regionFuture); + } + + // Wait for all tasks to complete + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + + if (dataPartitionMap.isEmpty()) { + LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); + status = TaskStatus.FAILED; + return; + } + + dataPartitionTable = new DataPartitionTable(dataPartitionMap); + status = TaskStatus.COMPLETED; + } + + private static void constructDataPartitionMap(List seqTsFileList, SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { + for (TsFileResource tsFileResource : seqTsFileList) { + Set devices = tsFileResource.getDevices(limiter); + long timeSlotId = tsFileResource.getTsFileID().timePartitionId; + int regionId = tsFileResource.getTsFileID().regionId; + + TConsensusGroupId consensusGroupId = new TConsensusGroupId(); + consensusGroupId.setId(regionId); + consensusGroupId.setType(TConsensusGroupType.DataRegion); + + for (IDeviceID deviceId : devices) { + TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + } + } } /** Generate DataPartitionTable by scanning all resource files. */ diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index d108180029b46..6e5b615cc1b27 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3201,11 +3201,6 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { @Override public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { - String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); - return generateDataPartitionTable(req, dataDirs); - } - - public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req, String[] dataDirs) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3220,71 +3215,50 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP return resp; } - // Get data directories and configuration - if (dataDirs.length == 0) { - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("dataDirs parameter are not configured in the iotdb-system.properties"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; - } - // Create generator for all data directories int seriesSlotNum = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionSlotNum(); String seriesPartitionExecutorClass = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); currentGenerator = - new DataPartitionTableGenerator( - dataDirs, partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); + new DataPartitionTableGenerator(partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately - currentGenerator.startGeneration(); + currentGenerator.startGeneration().get(timeoutMs, TimeUnit.MILLISECONDS); + + if (currentGenerator != null) { + switch (currentGenerator.getStatus()) { + case IN_PROGRESS: + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation interrupted"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + case COMPLETED: + DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); + if (dataPartitionTable != null) { + byte[] result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result); + } - // Wait for completion (with timeout) - long startTime = System.currentTimeMillis(); - - while (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { - if (System.currentTimeMillis() - startTime > timeoutMs) { - resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); - resp.setMessage("DataPartitionTable generation timed out"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; - } - - try { - Thread.sleep(100); // Sleep for 100ms - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("DataPartitionTable generation interrupted"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); + break; + default: + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; } } - // Check final status - if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { - DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); - if (dataPartitionTable != null) { - byte[] result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result); - } - - resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); - resp.setMessage("DataPartitionTable generation completed successfully"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); - - LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); - } else { - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - } + // Clear current generator + currentGenerator = null; } catch (Exception e) { LOGGER.error("Failed to generate DataPartitionTable", e); resp.setStatus( @@ -3383,13 +3357,14 @@ private void processDataDirectoryForEarliestTimeslots( /** Find the earliest timeslot in a database directory. */ private long findEarliestTimeslotInDatabase(File databaseDir) { String databaseName = databaseDir.getName(); + List> futureList = new ArrayList<>(); try { Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( regionPath -> { - findEarliestTimeSlotExecutor.submit(() -> { + Future future = findEarliestTimeSlotExecutor.submit(() -> { try { Files.list(regionPath) .filter(Files::isDirectory) @@ -3412,11 +3387,20 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { LOGGER.error("Failed to scan {}", regionPath, e); } }); + futureList.add(future); }); } catch (IOException e) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); } + for (Future future : futureList) { + try { + future.get(); + } catch (InterruptedException | ExecutionException e) { + LOGGER.error("Failed to wait for task completion", e); + Thread.currentThread().interrupt(); + } + } return databaseEarliestRegionMap.get(databaseName); } From 3960a0716952be6f41a8407f8b4ad5f889e75cfc Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:58:22 +0800 Subject: [PATCH 06/13] Adjust the default value is 1 min --- .../org/apache/iotdb/confignode/conf/ConfigNodeConfig.java | 2 +- .../assembly/resources/conf/iotdb-system.properties.template | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c549e1347bc7c..c284762fc10c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=2000; + private long partitionTableRecoverWaitAllDnUpTimeout=60000; public ConfigNodeConfig() { // empty constructor diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index ff9066c0dec22..9dc55d3903261 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,11 +757,10 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 -# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer -#partition_table_recover_wait_all_dn_up_timeout=300000 -partition_table_recover_wait_all_dn_up_timeout=2000 +partition_table_recover_wait_all_dn_up_timeout=60000 #################### ### Memory Control Configuration From 98a18c3fdcc8812ccc3bdf03c25473eee655178a Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:58:22 +0800 Subject: [PATCH 07/13] Adjust the default value is 1 min --- DataPartitionTableIntegrityCheck_README.md | 245 ------------------ .../confignode/conf/ConfigNodeConfig.java | 2 +- .../conf/iotdb-system.properties.template | 5 +- 3 files changed, 3 insertions(+), 249 deletions(-) delete mode 100644 DataPartitionTableIntegrityCheck_README.md diff --git a/DataPartitionTableIntegrityCheck_README.md b/DataPartitionTableIntegrityCheck_README.md deleted file mode 100644 index 7fe3eefabb041..0000000000000 --- a/DataPartitionTableIntegrityCheck_README.md +++ /dev/null @@ -1,245 +0,0 @@ -# IoTDB 数据分区表完整性检测功能实现 - -## 功能概述 - -本功能实现了IoTDB ConfigNode重启时的数据分区表完整性检测,能够自动发现并恢复丢失的数据分区信息。 - -## 实现架构 - -### 1. 核心组件 - -#### Procedure实现 -- **DataPartitionTableIntegrityCheckProcedure**: 主要的Procedure实现,负责整个完整性检测流程 -- **ConfigNodeProcedureEnv**: Procedure执行环境,提供ConfigManager访问 - -#### DataNode端实现 -- **DataPartitionTableGenerator**: 扫描tsfile并生成DataPartitionTable的核心组件 -- **RPC接口扩展**: 在DataNode RPC服务中添加了三个新接口 - -#### 配置和注册 -- **ProcedureType枚举扩展**: 添加了新的Procedure类型 -- **ProcedureFactory扩展**: 支持新Procedure的创建和反序列化 -- **启动监听器**: ConfigNode启动时自动触发检测 - -### 2. 执行流程 - -``` -ConfigNode重启 → 检查Leader状态 → 收集最早timeslot → 分析缺失分区 → -请求DN生成表 → 合并分区表 → 写入Raft日志 → 完成 -``` - -## 详细实现 - -### 1. Thrift接口定义 (datanode.thrift) - -新增的RPC接口: -```thrift -// 获取最早timeslot信息 -TGetEarliestTimeslotsResp getEarliestTimeslots() - -// 请求生成DataPartitionTable -TGenerateDataPartitionTableResp generateDataPartitionTable() - -// 检查生成状态 -TCheckDataPartitionTableStatusResp checkDataPartitionTableStatus() -``` - -对应的响应结构体: -```thrift -struct TGetEarliestTimeslotsResp { - 1: required common.TSStatus status - 2: optional map databaseToEarliestTimeslot -} - -struct TGenerateDataPartitionTableResp { - 1: required common.TSStatus status - 2: required i32 errorCode - 3: optional string message -} - -struct TCheckDataPartitionTableStatusResp { - 1: required common.TSStatus status - 2: required i32 errorCode - 3: optional string message - 4: optional binary dataPartitionTable -} -``` - -### 2. DataNode实现 - -#### DataPartitionTableGenerator -- **并行扫描**: 使用多线程并行扫描tsfile文件 -- **进度跟踪**: 提供处理进度和状态信息 -- **错误处理**: 统计失败文件并记录错误信息 -- **配置化**: 支持自定义线程数和分区配置 - -#### RPC服务实现 -在`DataNodeInternalRPCServiceImpl`中实现: -- `getEarliestTimeslots()`: 扫描数据目录获取每个数据库的最早timeslot -- `generateDataPartitionTable()`: 启动异步扫描任务 -- `checkDataPartitionTableStatus()`: 检查任务状态并返回结果 - -### 3. ConfigNode Procedure实现 - -#### 状态机设计 -```java -public enum State { - CHECK_LEADER_STATUS, // 检查Leader状态 - COLLECT_EARLIEST_TIMESLOTS, // 收集最早timeslot - ANALYZE_MISSING_PARTITIONS, // 分析缺失分区 - REQUEST_PARTITION_TABLES, // 请求生成分区表 - MERGE_PARTITION_TABLES, // 合并分区表 - WRITE_PARTITION_TABLE_TO_RAFT, // 写入Raft日志 - SUCCESS, // 成功完成 - FAILED // 执行失败 -} -``` - -#### 错误码定义 -```java -public static final int DN_ERROR_CODE_SUCCESS = 0; // 处理成功 -public static final int DN_ERROR_CODE_IN_PROGRESS = 2; // 正在执行 -public static final int DN_ERROR_CODE_FAILED = 1; // 处理失败 -public static final int DN_ERROR_CODE_UNKNOWN = -1; // DN未知状态 -``` - -#### 核心逻辑 -1. **Leader检查**: 只有Leader节点执行检测 -2. **数据收集**: 从所有DataNode收集最早timeslot信息 -3. **缺失分析**: 对比当前分区表,识别缺失的分区 -4. **异步处理**: 向DataNode发送异步扫描请求 -5. **状态轮询**: 定期检查任务状态,支持重试机制 -6. **数据合并**: 合并所有DataNode返回的分区表 -7. **Raft写入**: 通过共识层持久化最终分区表 - -### 4. 自动触发机制 - -#### 启动监听器 -```java -public class DataPartitionTableIntegrityCheckListener { - public void onStartupComplete() { - if (isLeader()) { - startIntegrityCheck(); - } - } - - public void onBecomeLeader() { - startIntegrityCheck(); - } -} -``` - -## 关键特性 - -### 1. 原子性保证 -- 每个步骤都是幂等的,支持重试 -- Procedure框架保证状态一致性 -- 失败时可以安全回滚 - -### 2. 容错机制 -- **重试策略**: 最多重试3次 -- **超时处理**: 避免无限等待 -- **部分失败**: 部分DataNode失败时继续处理 - -### 3. 性能优化 -- **并行扫描**: DataNode端使用多线程并行处理 -- **异步执行**: 避免阻塞主流程 -- **进度跟踪**: 提供实时进度信息 - -### 4. 可扩展性 -- **配置化**: 支持自定义线程数和分区配置 -- **模块化**: 各组件独立,易于扩展 -- **接口化**: 清晰的RPC接口定义 - -## 使用方式 - -### 1. 自动触发 -ConfigNode重启时自动检测并执行,无需手动干预。 - -### 2. 手动触发 -可以通过ProcedureExecutor手动提交检测Procedure: -```java -DataPartitionTableIntegrityCheckProcedure procedure = new DataPartitionTableIntegrityCheckProcedure(); -procedureExecutor.submit(procedure); -``` - -## 配置参数 - -### DataNode配置 -- `seriesSlotNum`: 系列分区槽数量 -- `seriesPartitionExecutorClass`: 分区执行器类名 -- `dataDirs`: 数据目录配置 - -### Procedure配置 -- `MAX_RETRY_COUNT`: 最大重试次数 (默认3) -- 重试间隔: 5秒 - -## 监控和日志 - -### 日志级别 -- **INFO**: 关键流程节点信息 -- **DEBUG**: 详细的执行过程 -- **ERROR**: 错误和异常信息 - -### 关键指标 -- 处理文件数量 -- 失败文件数量 -- 执行时间 -- 重试次数 -- DataNode响应状态 - -## 注意事项 - -### 1. 依赖关系 -- 需要ConfigNode为Leader状态 -- 依赖DataNode正常注册和通信 -- 需要共识层正常工作 - -### 2. 资源消耗 -- DataNode扫描会消耗CPU和I/O资源 -- 建议在低峰期执行 -- 大数据集时需要考虑内存使用 - -### 3. 网络带宽 -- DataPartitionTable序列化后可能较大 -- 需要考虑网络传输限制 -- 建议实现增量传输机制 - -## 后续优化建议 - -### 1. 增量扫描 -- 支持增量扫描,只处理新增文件 -- 维护扫描状态,避免重复工作 - -### 2. 分布式协调 -- 实现更智能的负载分配 -- 支持动态调整扫描策略 - -### 3. 缓存优化 -- 缓存扫描结果,避免重复计算 -- 实现智能失效机制 - -### 4. 监控增强 -- 添加更详细的性能指标 -- 实现告警机制 - -## 测试验证 - -### 1. 单元测试 -- 各组件独立测试 -- 边界条件测试 -- 异常场景验证 - -### 2. 集成测试 -- 端到端流程测试 -- 多节点环境验证 -- 故障恢复测试 - -### 3. 性能测试 -- 大数据集扫描测试 -- 并发性能测试 -- 资源使用监控 - ---- - -本实现提供了完整的IoTDB数据分区表完整性检测解决方案,具备高可用性、容错性和可扩展性,能够在ConfigNode重启时自动发现并恢复丢失的数据分区信息。 diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c549e1347bc7c..c284762fc10c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=2000; + private long partitionTableRecoverWaitAllDnUpTimeout=60000; public ConfigNodeConfig() { // empty constructor diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index ff9066c0dec22..9dc55d3903261 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,11 +757,10 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 -# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer -#partition_table_recover_wait_all_dn_up_timeout=300000 -partition_table_recover_wait_all_dn_up_timeout=2000 +partition_table_recover_wait_all_dn_up_timeout=60000 #################### ### Memory Control Configuration From d0882ee513d2dfafa580eafc7545d14b733666e8 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 18:12:06 +0800 Subject: [PATCH 08/13] Append a description about the unit --- .../resources/conf/iotdb-system.properties.template | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 9dc55d3903261..c36f35cd5778c 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -747,12 +747,7 @@ disk_space_warning_threshold=0.05 # Datatype: Integer partition_table_recover_worker_num=10 -# Limit the number of files used for parallel processing -# effectiveMode: restart -# Datatype: Integer -#partition_table_recover_process_file_num=1000 - -# Limit the number of bytes read per second from a file +# Limit the number of bytes read per second from a file, the unit is MB # effectiveMode: restart # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 From 9a53bd25ea0e7507fa81e1ff05bcb2380953273e Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 18:12:23 +0800 Subject: [PATCH 09/13] use the spotless command to format code --- .../client/sync/SyncDataNodeClientPool.java | 12 +- .../confignode/conf/ConfigNodeConfig.java | 5 +- .../confignode/conf/ConfigNodeDescriptor.java | 5 +- .../confignode/manager/ProcedureManager.java | 4 +- .../confignode/manager/node/NodeManager.java | 49 +-- ...PartitionTableIntegrityCheckProcedure.java | 301 ++++++++++++------ .../iotdb/confignode/service/ConfigNode.java | 54 ++-- .../apache/iotdb/db/conf/IoTDBDescriptor.java | 10 +- .../DataPartitionTableGenerator.java | 241 +++++++------- .../impl/DataNodeInternalRPCServiceImpl.java | 137 ++++---- .../timeindex/ArrayDeviceTimeIndex.java | 3 +- .../tsfile/timeindex/FileTimeIndex.java | 15 +- .../tsfile/timeindex/ITimeIndex.java | 4 +- .../DataNodeInternalRPCServiceImplTest.java | 9 +- .../iotdb/commons/ServerCommandLine.java | 3 +- .../commons/utils/TimePartitionUtils.java | 12 +- .../rateLimiter/LeakyBucketRateLimiter.java | 27 +- 17 files changed, 525 insertions(+), 366 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java index 84c027e513298..9f5729ef06dfd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java @@ -141,14 +141,14 @@ private void buildActionMap() { CnToDnSyncRequestType.SHOW_APPLIED_CONFIGURATIONS, (req, client) -> client.showAppliedConfigurations()); actionMapBuilder.put( - CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, - (req, client) -> client.getEarliestTimeslots()); + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + (req, client) -> client.getEarliestTimeslots()); actionMapBuilder.put( - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, - (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, + (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); actionMapBuilder.put( - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, - (req, client) -> client.generateDataPartitionTableHeartbeat()); + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + (req, client) -> client.generateDataPartitionTableHeartbeat()); actionMap = actionMapBuilder.build(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c284762fc10c4..c682107698a3a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=60000; + private long partitionTableRecoverWaitAllDnUpTimeout = 60000; public ConfigNodeConfig() { // empty constructor @@ -1293,7 +1293,8 @@ public long getPartitionTableRecoverWaitAllDnUpTimeout() { return partitionTableRecoverWaitAllDnUpTimeout; } - public void setPartitionTableRecoverWaitAllDnUpTimeout(long partitionTableRecoverWaitAllDnUpTimeout) { + public void setPartitionTableRecoverWaitAllDnUpTimeout( + long partitionTableRecoverWaitAllDnUpTimeout) { this.partitionTableRecoverWaitAllDnUpTimeout = partitionTableRecoverWaitAllDnUpTimeout; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 17ec570b6d7d0..e7d39fd3bcb87 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -325,9 +325,8 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio conf.setPartitionTableRecoverWaitAllDnUpTimeout( Long.parseLong( properties.getProperty( - "partition_table_recover_wait_all_dn_up_timeout", - String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout()))) - ); + "partition_table_recover_wait_all_dn_up_timeout", + String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout())))); String leaderDistributionPolicy = properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy()); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index bda00014ef204..1a69044d37d3d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -1375,9 +1375,7 @@ public TSStatus createRegionGroups( } } - /** - * Used to repair the lost data partition table - */ + /** Used to repair the lost data partition table */ public TSStatus dataPartitionTableIntegrityCheck() { DataPartitionTableIntegrityCheckProcedure procedure; synchronized (this) { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java index 7a7cf3ff13290..fdf8ef89f65d7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java @@ -1351,8 +1351,8 @@ private ExternalServiceManager getServiceManager() { } /** - * Check if all DataNodes are registered and running, then trigger integrity check. - * This method should be called after each DataNode registration. + * Check if all DataNodes are registered and running, then trigger integrity check. This method + * should be called after each DataNode registration. */ private void checkAndTriggerIntegrityCheck() { // Only trigger integrity check if this ConfigNode is the leader @@ -1362,19 +1362,22 @@ private void checkAndTriggerIntegrityCheck() { // Get all registered DataNodes List registeredDataNodes = getRegisteredDataNodes(); - + // Check if all registered DataNodes are running - boolean allDataNodesRunning = registeredDataNodes.stream() - .allMatch(dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }); + boolean allDataNodesRunning = + registeredDataNodes.stream() + .allMatch( + dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }); if (allDataNodesRunning && !registeredDataNodes.isEmpty()) { - LOGGER.info("All {} DataNodes are registered and running, triggering data partition table integrity check", - registeredDataNodes.size()); - + LOGGER.info( + "All {} DataNodes are registered and running, triggering data partition table integrity check", + registeredDataNodes.size()); + // Trigger integrity check asynchronously try { configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); @@ -1383,15 +1386,19 @@ private void checkAndTriggerIntegrityCheck() { LOGGER.error("Failed to submit data partition table integrity check procedure", e); } } else { - LOGGER.debug("Not all DataNodes are ready yet. Registered: {}, Running: {}", - registeredDataNodes.size(), - (int) registeredDataNodes.stream() - .filter(dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }) - .count()); + LOGGER.debug( + "Not all DataNodes are ready yet. Registered: {}, Running: {}", + registeredDataNodes.size(), + (int) + registeredDataNodes.stream() + .filter( + dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = + getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }) + .count()); } } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index dbb47019e136d..50522c5e66c93 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -24,6 +24,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; @@ -88,7 +89,7 @@ public class DataPartitionTableIntegrityCheckProcedure NodeManager dataNodeManager; private List allDataNodes = new ArrayList<>(); - //============Need serialize BEGIN=============/ + // ============Need serialize BEGIN=============/ /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ private Map earliestTimeslots = new ConcurrentHashMap<>(); @@ -104,14 +105,16 @@ public class DataPartitionTableIntegrityCheckProcedure private static Set failedDataNodes = new HashSet<>(); private static ScheduledExecutorService heartBeatExecutor; - //============Need serialize END=============/ + + // ============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { super(); } @Override - protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) + protected Flow executeFromState( + final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws InterruptedException { try { // Ensure to get the real-time DataNodes in the current cluster at every step @@ -143,7 +146,8 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPart } @Override - protected void rollbackState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) + protected void rollbackState( + final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws IOException, InterruptedException, ProcedureException { switch (state) { case COLLECT_EARLIEST_TIMESLOTS: @@ -189,7 +193,8 @@ private Flow collectEarliestTimeslots() { } if (allDataNodes.isEmpty()) { - LOG.error("No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); + LOG.error( + "No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -198,11 +203,20 @@ private Flow collectEarliestTimeslots() { allDataNodes.removeAll(skipDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { try { - TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); + TGetEarliestTimeslotsResp resp = + (TGetEarliestTimeslotsResp) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { failedDataNodes.add(dataNode); - LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + LOG.error( + "Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", + dataNode.getLocation().getDataNodeId(), + resp.getStatus()); continue; } @@ -215,9 +229,9 @@ private Flow collectEarliestTimeslots() { if (LOG.isDebugEnabled()) { LOG.debug( - "Collected earliest timeslots from the DataNode[id={}]: {}", - dataNode.getLocation().getDataNodeId(), - nodeTimeslots); + "Collected earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + nodeTimeslots); } } catch (Exception e) { LOG.error( @@ -237,7 +251,8 @@ private Flow collectEarliestTimeslots() { allDataNodes.size() - failedDataNodes.size()); } - if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { + if (failedDataNodes.size() == allDataNodes.size() + && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); } else { setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); @@ -246,7 +261,8 @@ private Flow collectEarliestTimeslots() { } /** - * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. + * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions + * of databases need to be repaired. */ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { @@ -254,7 +270,8 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } if (earliestTimeslots.isEmpty()) { - LOG.error("No missing data partitions detected, nothing needs to be repaired, terminating procedure"); + LOG.error( + "No missing data partitions detected, nothing needs to be repaired, terminating procedure"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -266,24 +283,39 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { // Get current DataPartitionTable from ConfigManager Map>>> - localDataPartitionTable = getLocalDataPartitionTable(env, database); + localDataPartitionTable = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTable == null || localDataPartitionTable.isEmpty() || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + if (localDataPartitionTable == null + || localDataPartitionTable.isEmpty() + || localDataPartitionTable.get(database) == null + || localDataPartitionTable.get(database).isEmpty()) { + lostDataPartitionsOfDatabases.add(database); + LOG.error( + "No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", + database); continue; } - Map>> seriesPartitionMap = localDataPartitionTable.get(database); + Map>> + seriesPartitionMap = localDataPartitionTable.get(database); for (Map.Entry>> - seriesPartitionEntry : seriesPartitionMap.entrySet()) { - Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); - tTimePartitionSlotListMap.keySet().forEach(slot -> { - if (!TimePartitionUtils.satisfyPartitionId(slot.getStartTime(), earliestTimeslot)) { - lostDataPartitionsOfDatabases.add(database); - LOG.warn("Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, earliestTimeslot); - } - }); + seriesPartitionEntry : seriesPartitionMap.entrySet()) { + Map> tTimePartitionSlotListMap = + seriesPartitionEntry.getValue(); + tTimePartitionSlotListMap + .keySet() + .forEach( + slot -> { + if (!TimePartitionUtils.satisfyPartitionId( + slot.getStartTime(), earliestTimeslot)) { + lostDataPartitionsOfDatabases.add(database); + LOG.warn( + "Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", + database, + earliestTimeslot); + } + }); } } @@ -300,23 +332,26 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - private Map>>> getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { - Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) + private Map>>> + getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { + Map> schemaPartitionTable = + env.getConfigManager() + .getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) .getSchemaPartitionTable(); // Construct request for getting data partition final Map> partitionSlotsMap = new HashMap<>(); schemaPartitionTable.forEach( - (key, value) -> { - Map slotListMap = new HashMap<>(); - value - .keySet() - .forEach( - slot -> - slotListMap.put( - slot, new TTimeSlotList(Collections.emptyList(), true, true))); - partitionSlotsMap.put(key, slotListMap); - }); + (key, value) -> { + Map slotListMap = new HashMap<>(); + value + .keySet() + .forEach( + slot -> + slotListMap.put( + slot, new TTimeSlotList(Collections.emptyList(), true, true))); + partitionSlotsMap.put(key, slotListMap); + }); final GetDataPartitionPlan getDataPartitionPlan = new GetDataPartitionPlan(partitionSlotsMap); return env.getConfigManager().getDataPartition(getDataPartitionPlan).getDataPartitionTable(); } @@ -327,16 +362,21 @@ private Map(allDataNodes).containsAll(failedDataNodes)) { + if (failedDataNodes.size() == allDataNodes.size() + && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -378,9 +428,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - /** - * Check completion status of DataPartitionTable generation tasks. - */ + /** Check completion status of DataPartitionTable generation tasks. */ private void checkPartitionTableGenerationStatus() { if (LOG.isDebugEnabled()) { LOG.info("Checking DataPartitionTable generation completion status..."); @@ -392,36 +440,52 @@ private void checkPartitionTableGenerationStatus() { if (!dataPartitionTables.containsKey(dataNodeId)) { try { - TGenerateDataPartitionTableHeartbeatResp resp = (TGenerateDataPartitionTableHeartbeatResp) SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); - DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + TGenerateDataPartitionTableHeartbeatResp resp = + (TGenerateDataPartitionTableHeartbeatResp) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + MAX_RETRY_COUNT); + DataPartitionTableGeneratorState state = + DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", dataNode.getLocation().getDataNodeId(), state, resp.getStatus()); + LOG.error( + "Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", + dataNode.getLocation().getDataNodeId(), + state, + resp.getStatus()); continue; } switch (state) { case SUCCESS: - LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); + LOG.info( + "DataNode {} completed DataPartitionTable generation, terminating heart beat", + dataNodeId); completeCount++; break; case IN_PROGRESS: LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); break; case FAILED: - LOG.error("DataNode {} failed to generate DataPartitionTable, terminating heart beat", dataNodeId); + LOG.error( + "DataNode {} failed to generate DataPartitionTable, terminating heart beat", + dataNodeId); completeCount++; break; default: - LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); + LOG.error( + "DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { LOG.error( - "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", - dataNodeId, - e.getMessage(), - e); + "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); completeCount++; } } else { @@ -434,9 +498,7 @@ private void checkPartitionTableGenerationStatus() { } } - /** - * Merge DataPartitionTables from all DataNodes into a final table. - */ + /** Merge DataPartitionTables from all DataNodes into a final table. */ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.info("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); @@ -456,46 +518,78 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { for (String database : lostDataPartitionsOfDatabases) { // Get current DataPartitionTable from ConfigManager Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + if (localDataPartitionTableMap == null + || localDataPartitionTableMap.isEmpty() + || localDataPartitionTableMap.get(database) == null + || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error( + "No data partition table related to database {} was found from the ConfigNode", + database); continue; } - localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { - return; - } - finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); + localDataPartitionTableMap + .values() + .forEach( + map -> + map.forEach( + (tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null + || seriesPartitionTableMap == null + || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent( + tSeriesPartitionSlot, + k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); } - finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables.values().forEach(dataPartitionTable -> { - if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot, dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition(k, tConsensusGroupId); - })); + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnDataPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( + k, tConsensusGroupId); + })); + }); + }); }); - }); - }); finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); break; @@ -523,8 +617,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.error("No database lost data partition table"); setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("No database lost data partition table for raft write")); + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No database lost data partition table for raft write")); return getFlow(); } @@ -541,7 +635,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { try { CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); Map assignedDataPartition = new HashMap<>(); - assignedDataPartition.put(lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); + assignedDataPartition.put( + lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); createPlan.setAssignedDataPartition(assignedDataPartition); TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); @@ -551,8 +646,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { } else { LOG.error("Failed to write DataPartitionTable to raft log"); setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("Failed to write DataPartitionTable to raft log")); + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); } } catch (Exception e) { LOG.error("Error writing DataPartitionTable to raft log", e); @@ -592,7 +687,7 @@ public void serialize(final DataOutputStream stream) throws IOException { for (Map.Entry entry : dataPartitionTables.entrySet()) { stream.writeInt(entry.getKey()); try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); entry.getValue().serialize(oos, protocol); @@ -614,7 +709,7 @@ public void serialize(final DataOutputStream stream) throws IOException { if (finalDataPartitionTable != null) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); finalDataPartitionTable.serialize(oos, protocol); @@ -686,7 +781,7 @@ public void deserialize(final ByteBuffer byteBuffer) { byte[] bytes = new byte[size]; byteBuffer.get(bytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { TTransport transport = new TIOStreamTransport(ois); TBinaryProtocol protocol = new TBinaryProtocol(transport); @@ -712,7 +807,7 @@ public void deserialize(final ByteBuffer byteBuffer) { byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; byteBuffer.get(finalDataPartitionTableBytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { TTransport transport = new TIOStreamTransport(ois); TBinaryProtocol protocol = new TBinaryProtocol(transport); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 9e4836a089cdf..01ae2499a9e80 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -69,6 +69,7 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -118,8 +119,7 @@ public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { private Future dataPartitionTableCheckFuture; private ExecutorService dataPartitionTableCheckExecutor = - IoTDBThreadPoolFactory.newSingleThreadExecutor( - "DATA_PARTITION_TABLE_CHECK"); + IoTDBThreadPoolFactory.newSingleThreadExecutor("DATA_PARTITION_TABLE_CHECK"); private final CountDownLatch latch = new CountDownLatch(1); @@ -222,29 +222,33 @@ public void active() { loadSecretKey(); loadHardwareCode(); - dataPartitionTableCheckFuture = dataPartitionTableCheckExecutor.submit(() -> { - LOGGER.info("Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); - Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); - - while (latch.getCount() > 0) { - List dnList = configManager - .getLoadManager() - .filterDataNodeThroughStatus(NodeStatus.Running); - if (dnList != null && !dnList.isEmpty()) { - LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); - TSStatus status = - configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error("Data partition table integrity check failed!"); - } - latch.countDown(); - } else { - LOGGER.info("No running datanodes found, waiting..."); - Thread.sleep(5000); // 等待5秒后重新检查 - } - } - return null; - }); + dataPartitionTableCheckFuture = + dataPartitionTableCheckExecutor.submit( + () -> { + LOGGER.info( + "Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + // Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); + + while (latch.getCount() > 0) { + List dnList = + configManager + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus.Running); + if (dnList != null && !dnList.isEmpty()) { + LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); + TSStatus status = + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + latch.countDown(); + } else { + LOGGER.info("No running datanodes found, waiting..."); + Thread.sleep(5000); // 等待5秒后重新检查 + } + } + return null; + }); return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index 4c4d7a6928747..a5e89bb250dfb 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1140,9 +1140,15 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException loadTrustedUriPattern(properties); conf.setPartitionTableRecoverWorkerNum( - Integer.parseInt(properties.getProperty("partition_table_recover_worker_num", String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); + Integer.parseInt( + properties.getProperty( + "partition_table_recover_worker_num", + String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); conf.setPartitionTableRecoverMaxReadBytesPerSecond( - Integer.parseInt(properties.getProperty("partition_table_recover_max_read_bytes_per_second", String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); + Integer.parseInt( + properties.getProperty( + "partition_table_recover_max_read_bytes_per_second", + String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); conf.setIncludeNullValueInWriteThroughputMetric( Boolean.parseBoolean( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 4ce321e9a536c..4f9a326f05223 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -33,6 +33,7 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -81,18 +82,28 @@ public class DataPartitionTableGenerator { private static final int EXECUTOR_MAX_TIMEOUT = 60; private static final LeakyBucketRateLimiter limiter = - new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); + new LeakyBucketRateLimiter( + (long) + IoTDBDescriptor.getInstance() + .getConfig() + .getPartitionTableRecoverMaxReadBytesPerSecond() + * 1024 + * 1024); public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; - public static final Set IGNORE_DATABASE = new HashSet() {{ - add("root.__audit"); - }}; + public static final Set IGNORE_DATABASE = + new HashSet() { + { + add("root.__audit"); + add("root.__system"); + } + }; public DataPartitionTableGenerator( - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { this.executor = executor; this.databases = databases; this.seriesSlotNum = seriesSlotNum; @@ -105,7 +116,7 @@ public DataPartitionTableGenerator( Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { - this.dataDirectories = new String[]{dataDirectory}; + this.dataDirectories = new String[] {dataDirectory}; this.executor = executor; this.databases = databases; this.seriesSlotNum = seriesSlotNum; @@ -113,11 +124,11 @@ public DataPartitionTableGenerator( } public DataPartitionTableGenerator( - String[] dataDirectories, - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { + String[] dataDirectories, + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { this.dataDirectories = dataDirectories; this.executor = executor; this.databases = databases; @@ -132,10 +143,7 @@ public enum TaskStatus { FAILED } - /** - * Start generating DataPartitionTable asynchronously. - * - */ + /** Start generating DataPartitionTable asynchronously. */ public CompletableFuture startGeneration() { if (status != TaskStatus.NOT_STARTED) { throw new IllegalStateException("Task is already started or completed"); @@ -150,28 +158,29 @@ private void generateDataPartitionTableByMemory() { List> futures = new ArrayList<>(); SeriesPartitionExecutor seriesPartitionExecutor = - SeriesPartitionExecutor.getSeriesPartitionExecutor( - seriesPartitionExecutorClass, seriesSlotNum); + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { CompletableFuture regionFuture = - CompletableFuture.runAsync( - () -> { - TsFileManager tsFileManager = dataRegion.getTsFileManager(); - String databaseName = dataRegion.getDatabaseName(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - tsFileManager.readLock(); - List seqTsFileList = tsFileManager.getTsFileList(true); - List unseqTsFileList = tsFileManager.getTsFileList(false); - tsFileManager.readUnlock(); - - constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); - constructDataPartitionMap(unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); - }, - executor); + CompletableFuture.runAsync( + () -> { + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); + constructDataPartitionMap( + unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + }, + executor); futures.add(regionFuture); } @@ -188,7 +197,10 @@ private void generateDataPartitionTableByMemory() { status = TaskStatus.COMPLETED; } - private static void constructDataPartitionMap(List seqTsFileList, SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { + private static void constructDataPartitionMap( + List seqTsFileList, + SeriesPartitionExecutor seriesPartitionExecutor, + Map dataPartitionMap) { for (TsFileResource tsFileResource : seqTsFileList) { Set devices = tsFileResource.getDevices(limiter); long timeSlotId = tsFileResource.getTsFileID().timePartitionId; @@ -199,9 +211,14 @@ private static void constructDataPartitionMap(List seqTsFileList consensusGroupId.setType(TConsensusGroupType.DataRegion); for (IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + TSeriesPartitionSlot seriesSlotId = + seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap + .computeIfAbsent( + seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) + .putDataPartition(timePartitionSlot, consensusGroupId); } } } @@ -221,46 +238,49 @@ private void generateDataPartitionTable() throws IOException { // Process all data directories for (String dataDirectory : dataDirectories) { LOG.info("Processing data directory: {}", dataDirectory); - + // First layer: database directories Files.list(Paths.get(dataDirectory)) .filter(Files::isDirectory) - .forEach(sequenceTypePath -> { - try { - Files.list(sequenceTypePath) + .forEach( + sequenceTypePath -> { + try { + Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Processing database: {}", databaseName); - } - - try { - Files.list(dbPath) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) + || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Processing database: {}", databaseName); + } + + try { + Files.list(dbPath) .filter(Files::isDirectory) .forEach( - regionPath -> { - processRegionDirectory( - regionPath, - databaseName, - dataPartitionMap, - executor, - futures); - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", dbPath, e); - failedFiles.incrementAndGet(); - } - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", sequenceTypePath, e); - failedFiles.incrementAndGet(); - } - }); + regionPath -> { + processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", sequenceTypePath, e); + failedFiles.incrementAndGet(); + } + }); } // Wait for all tasks to complete @@ -352,10 +372,7 @@ private void processTimeSlotDirectory( .forEach( tsFilePath -> { processTsFile( - tsFilePath.toFile(), - consensusGroupId, - timeSlotLong, - dataPartitionMap); + tsFilePath.toFile(), consensusGroupId, timeSlotLong, dataPartitionMap); }); } catch (IOException e) { LOG.error("Failed to walk time slot directory: {}", timeSlotPath, e); @@ -380,9 +397,14 @@ private void processTsFile( seriesPartitionExecutorClass, seriesSlotNum); for (org.apache.tsfile.file.metadata.IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + TSeriesPartitionSlot seriesSlotId = + seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap + .computeIfAbsent( + seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) + .putDataPartition(timePartitionSlot, consensusGroupId); } if (processedFiles.get() % 1000 == 0) { @@ -394,9 +416,11 @@ private void processTsFile( } } - private static SeriesPartitionTable newSeriesPartitionTable(TConsensusGroupId consensusGroupId, long timeSlotId) { + private static SeriesPartitionTable newSeriesPartitionTable( + TConsensusGroupId consensusGroupId, long timeSlotId) { SeriesPartitionTable seriesPartitionTable = new SeriesPartitionTable(); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); seriesPartitionTable.putDataPartition(timePartitionSlot, consensusGroupId); return seriesPartitionTable; } @@ -407,29 +431,32 @@ private void countTotalFiles() throws IOException { for (String dataDirectory : dataDirectories) { Files.list(Paths.get(dataDirectory)) - .filter(Files::isDirectory) - .forEach(sequenceTypePath -> { - try { - Files.list(sequenceTypePath) - .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - try { - Files.walk(dbPath) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach(p -> fileCount.incrementAndGet()); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", dbPath, e); - } - }); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); - } + .filter(Files::isDirectory) + .forEach( + sequenceTypePath -> { + try { + Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) + || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + try { + Files.walk(dbPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", dbPath, e); + } + }); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); + } }); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 6e5b615cc1b27..ab4f1523516aa 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,7 +19,6 @@ package org.apache.iotdb.db.protocol.thrift.impl; -import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -322,6 +321,8 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; + +import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -433,22 +434,22 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface 0L, TimeUnit.SECONDS, new ArrayBlockingQueue<>( - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); private final ExecutorService partitionTableRecoverExecutor = - new WrappedThreadPoolExecutor( - 0, - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), - 0L, - TimeUnit.SECONDS, - new ArrayBlockingQueue<>( - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), - new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), - ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), - new ThreadPoolExecutor.CallerRunsPolicy()); + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), + ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); @@ -3200,7 +3201,8 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { } @Override - public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + public TGenerateDataPartitionTableResp generateDataPartitionTable( + TGenerateDataPartitionTableReq req) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3221,7 +3223,11 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); currentGenerator = - new DataPartitionTableGenerator(partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); + new DataPartitionTableGenerator( + partitionTableRecoverExecutor, + req.getDatabases(), + seriesSlotNum, + seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately @@ -3251,7 +3257,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP resp.setDataPartitionTable(empty); resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); break; } @@ -3301,7 +3307,8 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb break; case FAILED: resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); break; default: @@ -3333,20 +3340,24 @@ private void processDataDirectoryForEarliestTimeslots( try { Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { - return; - } - databaseEarliestRegionMap.computeIfAbsent(databaseName, key -> Long.MAX_VALUE); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (DataPartitionTableGenerator.IGNORE_DATABASE.contains( + databaseName)) { + return; + } + databaseEarliestRegionMap.computeIfAbsent( + databaseName, key -> Long.MAX_VALUE); + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); - if (earliestTimeslot != Long.MAX_VALUE) { - earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); - } - }); + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); } catch (IOException e) { - LOGGER.error("Failed to process data directory: {}", sequenceTypePath.toFile(), e); + LOGGER.error( + "Failed to process data directory: {}", sequenceTypePath.toFile(), e); } }); } catch (IOException e) { @@ -3363,31 +3374,49 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( - regionPath -> { - Future future = findEarliestTimeSlotExecutor.submit(() -> { - try { - Files.list(regionPath) - .filter(Files::isDirectory) - .forEach(timeSlotPath -> { - try { - Optional matchedFile = Files.find(timeSlotPath, 1, (path, attrs) -> attrs.isRegularFile() && path.toString().endsWith(DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME)).findFirst(); - if (!matchedFile.isPresent()) { - return; - } - String timeSlotName = timeSlotPath.getFileName().toString(); - long timeslot = Long.parseLong(timeSlotName); - if (timeslot < databaseEarliestRegionMap.get(databaseName)) { - databaseEarliestRegionMap.put(databaseName, timeslot); - } - } catch (IOException e) { - LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); - } - }); - } catch (IOException e) { - LOGGER.error("Failed to scan {}", regionPath, e); - } - }); - futureList.add(future); + regionPath -> { + Future future = + findEarliestTimeSlotExecutor.submit( + () -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach( + timeSlotPath -> { + try { + Optional matchedFile = + Files.find( + timeSlotPath, + 1, + (path, attrs) -> + attrs.isRegularFile() + && path.toString() + .endsWith( + DataPartitionTableGenerator + .SCAN_FILE_SUFFIX_NAME)) + .findFirst(); + if (!matchedFile.isPresent()) { + return; + } + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot + < databaseEarliestRegionMap.get(databaseName)) { + databaseEarliestRegionMap.put(databaseName, timeslot); + } + } catch (IOException e) { + LOGGER.error( + "Failed to find any {} files in the {} directory", + DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, + timeSlotPath, + e); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to scan {}", regionPath, e); + } + }); + futureList.add(future); }); } catch (IOException e) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); @@ -3407,7 +3436,7 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { try (PublicBAOS baos = new PublicBAOS(); - DataOutputStream oos = new DataOutputStream(baos)) { + DataOutputStream oos = new DataOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); dataPartitionTable.serialize(oos, protocol); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java index 71a761a813731..a3262ddd37a1a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java @@ -173,7 +173,8 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } @Override - public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + public Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { return deviceToIndex.keySet(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index a0a725c85d73d..059663c5a6aea 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -122,11 +122,12 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } @Override - public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + public Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { tsFileResource.readLock(); try (InputStream inputStream = - FSFactoryProducer.getFSFactory() - .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + FSFactoryProducer.getFSFactory() + .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { // The first byte is VERSION_NUMBER, second byte is timeIndexType. byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); limiter.acquire(bytes.length); @@ -141,15 +142,15 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc return Collections.emptySet(); } else { logger.error( - "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); throw new RuntimeException( - "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); + "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); } } catch (Exception e) { logger.error( - "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); throw new RuntimeException( - "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); + "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); } finally { tsFileResource.readUnlock(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index 5f94703a944ba..400c478df5054 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.utils.Pair; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -79,7 +80,8 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali * * @return device names */ - Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); /** * @return whether end time is empty (Long.MIN_VALUE) diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index adf276dd8e2a8..066e1bea5bfd7 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -28,7 +28,6 @@ import org.apache.iotdb.commons.consensus.DataRegionId; import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.exception.MetadataException; -import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.consensus.ConsensusFactory; @@ -54,13 +53,11 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.db.utils.EnvironmentUtils; -import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; -import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; -import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TPlanNode; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeReq; + import org.apache.ratis.util.FileUtils; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.file.metadata.enums.CompressionType; @@ -80,16 +77,14 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.Set; public class DataNodeInternalRPCServiceImplTest { private static final Logger LOG = - LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); + LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); private static final IoTDBConfig conf = IoTDBDescriptor.getInstance().getConfig(); DataNodeInternalRPCServiceImpl dataNodeInternalRPCServiceImpl; private static IConsensus instance; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java index 7901f9cc36a1d..b700dbbb6b033 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java @@ -18,6 +18,8 @@ */ package org.apache.iotdb.commons; +import org.apache.iotdb.commons.exception.IoTDBException; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; @@ -26,7 +28,6 @@ import org.apache.commons.cli.OptionGroup; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.iotdb.commons.exception.IoTDBException; import java.io.PrintWriter; import java.util.HashSet; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index d1a550c5ca1c9..4eeddff9db7f9 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -132,10 +132,15 @@ public static long getTimePartitionIdWithoutOverflow(long time) { public static long getTimeWithoutOverflow(long partitionId) { BigInteger bigTime = bigTimePartitionInterval.multiply(BigInteger.valueOf(partitionId)); - if (bigTime.compareTo(BigInteger.ZERO) > 0 || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { + if (bigTime.compareTo(BigInteger.ZERO) > 0 + || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { return bigTime.add(bigTimePartitionOrigin).longValue(); } - return BigInteger.valueOf(partitionId).add(BigInteger.ONE).multiply(bigTimePartitionInterval).add(bigTimePartitionOrigin).longValue(); + return BigInteger.valueOf(partitionId) + .add(BigInteger.ONE) + .multiply(bigTimePartitionInterval) + .add(bigTimePartitionOrigin) + .longValue(); } public static long getTimeByPartitionId(long partitionId) { @@ -155,7 +160,8 @@ public static boolean satisfyPartitionId(long startTime, long endTime, long part } public static boolean satisfyPartitionId(long startTime, long partitionId) { - long endTime = startTime >= timePartitionLowerBoundWithoutOverflow + long endTime = + startTime >= timePartitionLowerBoundWithoutOverflow ? Long.MAX_VALUE : (startTime + timePartitionInterval - 1); return satisfyPartitionId(startTime, endTime, partitionId); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java index faff05c6ff69c..7af863db614b4 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -23,13 +23,8 @@ import java.util.concurrent.locks.LockSupport; /** - * A global leaky-bucket rate limiter for bytes throughput. - * Features: - * - Strict throughput limiting (no burst) - * - Smooth bandwidth shaping - * - Thread-safe - * - Fair for multi-thread - * - Low contention + * A global leaky-bucket rate limiter for bytes throughput. Features: - Strict throughput limiting + * (no burst) - Smooth bandwidth shaping - Thread-safe - Fair for multi-thread - Low contention */ public class LeakyBucketRateLimiter { /** bytes per second */ @@ -52,7 +47,7 @@ public LeakyBucketRateLimiter(long bytesPerSecond) { /** * Acquire permission for reading bytes. * - * This method will block if reading too fast. + *

This method will block if reading too fast. */ public void acquire(long bytes) { if (bytes <= 0) { @@ -95,9 +90,7 @@ public boolean tryAcquire(long bytes) { return false; } - /** - * Update rate dynamically. - */ + /** Update rate dynamically. */ public void setRate(long newBytesPerSecond) { if (newBytesPerSecond <= 0) { throw new IllegalArgumentException("bytesPerSecond must be > 0"); @@ -105,23 +98,17 @@ public void setRate(long newBytesPerSecond) { this.bytesPerSecond = newBytesPerSecond; } - /** - * Current rate. - */ + /** Current rate. */ public long getRate() { return bytesPerSecond; } - /** - * Total bytes processed. - */ + /** Total bytes processed. */ public long getTotalBytes() { return totalBytes.get(); } - /** - * Expected time based on bytes processed. - */ + /** Expected time based on bytes processed. */ private long expectedTimeNs(long totalBytes) { return startTimeNs + (totalBytes * 1_000_000_000L) / bytesPerSecond; } From 0448cb0aad9b81ca2ea56f24e6e7b3d3a9ed7204 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 18:29:24 +0800 Subject: [PATCH 10/13] Avoid writing duplicate values --- .../iotdb/commons/partition/SeriesPartitionTable.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java index f46344566dc32..ffb0413bc87e7 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java @@ -73,7 +73,11 @@ public Map> getSeriesPartitionMap() } public void putDataPartition(TTimePartitionSlot timePartitionSlot, TConsensusGroupId groupId) { - seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()).add(groupId); + seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); + List groupList = seriesPartitionMap.get(timePartitionSlot); + if (!groupList.contains(groupId)) { + groupList.add(groupId); + } } /** From 2468cf7689f073cdee5527de49dde0a3a71f35f2 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 23:05:53 +0800 Subject: [PATCH 11/13] Fix bug when get no data partition table in the ConfigNode. --- ...PartitionTableIntegrityCheckProcedure.java | 123 ++++++++++-------- 1 file changed, 67 insertions(+), 56 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 50522c5e66c93..ce433c113be0a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -291,7 +291,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { lostDataPartitionsOfDatabases.add(database); - LOG.error( + LOG.warn( "No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", database); continue; @@ -510,10 +510,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - int failedCnt = 0; - while (failedCnt < MAX_RETRY_COUNT) { - try { - Map finalDataPartitionMap = new HashMap<>(); + Map finalDataPartitionMap = new HashMap<>(); for (String database : lostDataPartitionsOfDatabases) { // Get current DataPartitionTable from ConfigManager @@ -525,8 +522,8 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error( - "No data partition table related to database {} was found from the ConfigNode", + LOG.warn( + "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", database); continue; } @@ -548,60 +545,74 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { })); } - finalDataPartitionMap.forEach( - (tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables + if (finalDataPartitionMap.isEmpty()) { + dataPartitionTables .values() .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap().forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (dnSeriesPartitionSlot == null + || dnSeriesPartitionTable == null) { + return; + } + finalDataPartitionMap.computeIfAbsent( + dnSeriesPartitionSlot, + k -> dnSeriesPartitionTable); + }); + }); + } else { + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() .forEach( - (dnSeriesPartitionSlot, dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null - || seriesPartitionTable.getSeriesPartitionMap() == null - || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put( - tSeriesPartitionSlot, dnDataPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable - .getSeriesPartitionMap() - .forEach( - (k, v) -> - v.forEach( - tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition( - k, tConsensusGroupId); - })); - }); - }); - }); + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnSeriesPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnSeriesPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( + k, tConsensusGroupId); + })); + }); + }); + }); + } finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); - break; - } catch (Exception e) { - LOG.error("Failed to merge DataPartitionTables", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - failedCnt++; - if (failedCnt >= MAX_RETRY_COUNT) { - return Flow.NO_MORE_STATE; - } - } - } LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); From 52e3fdec55896bf39c5fd6be9facd3cc315e5f35 Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 11:13:04 +0800 Subject: [PATCH 12/13] Add a license description. --- ...tionTableIntegrityCheckProcedureState.java | 25 +++++++++++++++---- .../DataPartitionTableGeneratorState.java | 19 ++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java index 7028adf9b4b9a..2173ea8ef4589 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iotdb.confignode.procedure.state; public enum DataPartitionTableIntegrityCheckProcedureState { @@ -10,9 +29,5 @@ public enum DataPartitionTableIntegrityCheckProcedureState { /** Merge DataPartitionTables from all DataNodes */ MERGE_PARTITION_TABLES, /** Write final DataPartitionTable to raft log */ - WRITE_PARTITION_TABLE_TO_RAFT, - /** Procedure completed successfully */ - SUCCESS, - /** Procedure failed */ - FAILED + WRITE_PARTITION_TABLE_TO_RAFT } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java index 0d0d09c182e05..a07f6e313cdb2 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iotdb.commons.enums; public enum DataPartitionTableGeneratorState { From 7c7693d63545c49cb17152a5e60dfcfec031a80a Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 11:52:11 +0800 Subject: [PATCH 13/13] mvn spotless:apply --- ...PartitionTableIntegrityCheckProcedure.java | 210 +++++++++--------- 1 file changed, 106 insertions(+), 104 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index ce433c113be0a..a417e78ec6afd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -44,6 +44,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -373,10 +374,12 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - ScheduledExecutorUtil.safelyScheduleAtFixedRate(heartBeatExecutor, this::checkPartitionTableGenerationStatus, - 0, - HEART_BEAT_REQUEST_RATE, - TimeUnit.MILLISECONDS); + ScheduledExecutorUtil.safelyScheduleAtFixedRate( + heartBeatExecutor, + this::checkPartitionTableGenerationStatus, + 0, + HEART_BEAT_REQUEST_RATE, + TimeUnit.MILLISECONDS); allDataNodes.removeAll(skipDataNodes); allDataNodes.removeAll(failedDataNodes); @@ -510,109 +513,108 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - Map finalDataPartitionMap = new HashMap<>(); - - for (String database : lostDataPartitionsOfDatabases) { - // Get current DataPartitionTable from ConfigManager - Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - - // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null - || localDataPartitionTableMap.isEmpty() - || localDataPartitionTableMap.get(database) == null - || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.warn( - "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", - database); - continue; - } + Map finalDataPartitionMap = new HashMap<>(); - localDataPartitionTableMap - .values() - .forEach( - map -> - map.forEach( - (tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null - || seriesPartitionTableMap == null - || seriesPartitionTableMap.isEmpty()) { - return; - } - finalDataPartitionMap.computeIfAbsent( - tSeriesPartitionSlot, - k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); - } + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - if (finalDataPartitionMap.isEmpty()) { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap().forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (dnSeriesPartitionSlot == null - || dnSeriesPartitionTable == null) { - return; - } - finalDataPartitionMap.computeIfAbsent( - dnSeriesPartitionSlot, - k -> dnSeriesPartitionTable); - }); - }); - } else { - finalDataPartitionMap.forEach( - (tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() - .forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null - || seriesPartitionTable.getSeriesPartitionMap() == null - || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put( - tSeriesPartitionSlot, dnSeriesPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnSeriesPartitionTable - .getSeriesPartitionMap() - .forEach( - (k, v) -> - v.forEach( - tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition( - k, tConsensusGroupId); - })); - }); - }); - }); - } + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null + || localDataPartitionTableMap.isEmpty() + || localDataPartitionTableMap.get(database) == null + || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.warn( + "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", + database); + continue; + } - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + localDataPartitionTableMap + .values() + .forEach( + map -> + map.forEach( + (tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null + || seriesPartitionTableMap == null + || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent( + tSeriesPartitionSlot, + k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); + } + + if (finalDataPartitionMap.isEmpty()) { + dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (dnSeriesPartitionSlot == null || dnSeriesPartitionTable == null) { + return; + } + finalDataPartitionMap.computeIfAbsent( + dnSeriesPartitionSlot, k -> dnSeriesPartitionTable); + }); + }); + } else { + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnSeriesPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnSeriesPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( + k, tConsensusGroupId); + })); + }); + }); + }); + } + + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT);