From ee1d27ee06685d70367577954527cea8a5dae07b Mon Sep 17 00:00:00 2001 From: Peng Junzhi <201250214@smail.nju.edu.cn> Date: Sat, 18 Apr 2026 00:35:19 +0800 Subject: [PATCH] Fix region migration retries and peer recreation --- .../procedure/env/RegionMaintainHandler.java | 16 +++++++++++----- .../apache/iotdb/consensus/iot/IoTConsensus.java | 9 ++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java index 5043becc1fad5..34d4802c1c4f4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java @@ -96,6 +96,7 @@ public class RegionMaintainHandler { private static final Logger LOGGER = LoggerFactory.getLogger(RegionMaintainHandler.class); + private static final int DELETE_OLD_REGION_PEER_RPC_RETRY_NUM = 3; private static final ConfigNodeConfig CONF = ConfigNodeDescriptor.getInstance().getConf(); @@ -175,7 +176,10 @@ public TSStatus createNewRegionPeer(TConsensusGroupId regionId, TDataNodeLocatio || IOT_CONSENSUS_V2.equals(CONF.getDataRegionConsensusProtocolClass()))) { // parameter of createPeer for MultiLeader should be all peers currentPeerNodes = new ArrayList<>(regionReplicaNodes); - currentPeerNodes.add(destDataNode); + if (currentPeerNodes.stream() + .noneMatch(node -> node.getDataNodeId() == destDataNode.getDataNodeId())) { + currentPeerNodes.add(destDataNode); + } } else { // parameter of createPeer for Ratis can be empty currentPeerNodes = Collections.emptyList(); @@ -299,15 +303,17 @@ public TSStatus submitDeleteOldRegionPeerTask( TMaintainPeerReq maintainPeerReq = new TMaintainPeerReq(regionId, originalDataNode, procedureId); - // Always use full retries regardless of node status, because after a cluster crash the - // target DataNode may be Unknown but still in the process of restarting. + // RemoveRegionPeerProcedure already retries DELETE_OLD_REGION_PEER at the procedure level. + // Keep each RPC attempt bounded so a permanently down original DataNode does not block the + // procedure for minutes, while still tolerating a briefly restarting node after cluster crash. status = (TSStatus) SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithRetry( + .sendSyncRequestToDataNodeWithGivenRetry( originalDataNode.getInternalEndPoint(), maintainPeerReq, - CnToDnSyncRequestType.DELETE_OLD_REGION_PEER); + CnToDnSyncRequestType.DELETE_OLD_REGION_PEER, + DELETE_OLD_REGION_PEER_RPC_RETRY_NUM); LOGGER.info( "{}, Send action deleteOldRegionPeer finished, regionId: {}, dataNodeId: {}", REGION_MIGRATE_PROCESS, diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index 959191ca2d6d3..5e59c1a727368 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -273,10 +273,17 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) String path = buildPeerDir(storageDir, groupId); File file = new File(path); - if (!file.mkdirs()) { + if (!file.exists() && !file.mkdirs()) { logger.warn("Unable to create consensus dir for group {} at {}", groupId, path); return null; } + if (!file.isDirectory()) { + logger.warn( + "Consensus dir path for group {} exists but is not a directory: {}", + groupId, + path); + return null; + } IoTConsensusServerImpl impl = new IoTConsensusServerImpl(