From b5fda9d703890801fae31dc554ce09d7727f836f Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 11 Jun 2026 12:07:46 +0800 Subject: [PATCH 1/5] Fix MIGRATE REGION falsely reported complete on ConfigNode leader switch When the ConfigNode leader is gracefully stopped (or loses leadership) while AddRegionPeerProcedure is waiting for the coordinator DataNode's AddRegionPeer task to finish, RegionMaintainHandler.waitTaskFinish() is interrupted and returns PROCESSING. The DO_ADD_REGION_PEER state previously treated PROCESSING as a no-op terminal state (return Flow.NO_MORE_STATE), silently ending the AddRegionPeerProcedure without success or rollback. The parent RegionMigrateProcedure had already persisted at CHECK_ADD_REGION_PEER, so the new leader resumed there directly. Its isDataNodeContainsRegion() check only inspects the partition table's location list, which is written at CREATE_NEW_REGION_PEER (long before the snapshot finishes transferring). It therefore passed, the source replica was removed, and the migration was declared a success while the destination replica was still in Adding state and had not received the snapshot. Queries against the region returned incorrect results during the gap (observed: ~17 min until the destination became active). Fix: in the PROCESSING branch, stay at DO_ADD_REGION_PEER and persist it (HAS_MORE_STATE) instead of ending. After recovery the new leader re-enters DO_ADD_REGION_PEER and re-polls the coordinator task until it truly reaches SUCCESS or FAIL. The re-poll is idempotent: the isStateDeserialized() guard skips re-submitting after a restart, and the coordinator DataNode dedups by taskId (putIfAbsent) even on a same-process re-execute, so the AddRegionPeer task is never submitted twice. If the coordinator crashed and lost its task table, the poll returns TASK_NOT_EXIST and falls through to the existing FAIL/rollback path. Add cnLeaderSwitchDuringDoAddPeerTest for each consensus protocol (IoTConsensus, IoTConsensusV2 batch/stream, Ratis). Existing daily ConfigNode-crash ITs all use stopForcibly() (SIGKILL), which kills the process before it can run the PROCESSING branch; the new test uses a graceful stop() (SIGTERM) of the leader among 3 ConfigNodes so the interrupted PROCESSING path is actually exercised across a real leader switch. --- ...RegionOperationReliabilityITFramework.java | 43 +++++++++++++++++++ ...DBRegionMigrateConfigNodeCrashIoTV1IT.java | 25 +++++++++++ ...ionMigrateConfigNodeCrashIoTV2BatchIT.java | 25 +++++++++++ ...onMigrateConfigNodeCrashIoTV2StreamIT.java | 25 +++++++++++ ...egionMigrateConfigNodeCrashForRatisIT.java | 25 +++++++++++ .../confignode/i18n/ProcedureMessages.java | 2 +- .../confignode/i18n/ProcedureMessages.java | 2 +- .../impl/region/AddRegionPeerProcedure.java | 14 +++++- 8 files changed, 158 insertions(+), 3 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java index 9b9c4ad41a1f6..b2da179dfb7e9 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java @@ -119,6 +119,27 @@ public class IoTDBRegionOperationReliabilityITFramework { LOGGER.info("Cluster has been restarted"); }; + /** + * Gracefully stop (SIGTERM, not a forcible kill) the ConfigNode that hit the kill point, then + * restart it. A graceful stop lets the ConfigNode run its shutdown hooks, which interrupts the + * in-flight region-operation procedure worker. This reproduces a leader switch / graceful + * shutdown during AddRegionPeer: the interrupted {@code waitTaskFinish()} returns PROCESSING + * while the AddRegionPeerTask is still running on the coordinator DataNode. The procedure must + * NOT silently end here, otherwise the parent RegionMigrateProcedure would falsely treat AddPeer + * as complete and remove the source replica before the destination replica is actually Running. + * See AddRegionPeerProcedure#executeFromState DO_ADD_REGION_PEER PROCESSING branch. + */ + public static Consumer actionOfGracefullyRestartConfigNode = + context -> { + Assert.assertTrue(context.getNodeWrapper() instanceof ConfigNodeWrapper); + context.getNodeWrapper().stop(); + LOGGER.info("ConfigNode {} gracefully stopped.", context.getNodeWrapper().getId()); + Assert.assertFalse(context.getNodeWrapper().isAlive()); + context.getNodeWrapper().start(); + LOGGER.info("ConfigNode {} restarted.", context.getNodeWrapper().getId()); + Assert.assertTrue(context.getNodeWrapper().isAlive()); + }; + @Before public void setUp() throws Exception { EnvFactory.getEnv() @@ -155,6 +176,28 @@ public void successTest( killNode); } + public void successTestWithAction( + final int dataReplicateFactor, + final int schemaReplicationFactor, + final int configNodeNum, + final int dataNodeNum, + KeySetView killConfigNodeKeywords, + KeySetView killDataNodeKeywords, + Consumer actionWhenDetectKeyWords, + KillNode killNode) + throws Exception { + generalTestWithAllOptions( + dataReplicateFactor, + schemaReplicationFactor, + configNodeNum, + dataNodeNum, + killConfigNodeKeywords, + killDataNodeKeywords, + actionWhenDetectKeyWords, + true, + killNode); + } + public void failTest( final int dataReplicateFactor, final int schemaReplicationFactor, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java index ebda0c36ee327..b9fb27f97fd27 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java @@ -28,6 +28,7 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -92,6 +93,30 @@ public void testCnCrashDuringDoAddPeer() throws Exception { KillNode.CONFIG_NODE); } + /** + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for + * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the + * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish + * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on + * PROCESSING, letting the parent procedure remove the source replica before the destination + * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + */ + @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) + public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { + successTestWithAction( + 1, + 1, + 3, + 2, + buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + noKillPoints(), + actionOfGracefullyRestartConfigNode, + KillNode.CONFIG_NODE); + } + @Test public void cnCrashDuringUpdateCacheTest() throws Exception { successTest( diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java index bc4f477b6bd84..63fbbe12516fe 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java @@ -26,6 +26,7 @@ import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Ignore; @@ -79,6 +80,30 @@ public void testCnCrashDuringDoAddPeer() throws Exception { KillNode.CONFIG_NODE); } + /** + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for + * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the + * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish + * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on + * PROCESSING, letting the parent procedure remove the source replica before the destination + * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + */ + @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) + public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { + successTestWithAction( + 1, + 1, + 3, + 2, + buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + noKillPoints(), + actionOfGracefullyRestartConfigNode, + KillNode.CONFIG_NODE); + } + @Test public void cnCrashDuringUpdateCacheTest() throws Exception { successTest( diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java index 39b5953de4a15..3fece46382501 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java @@ -28,6 +28,7 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -93,6 +94,30 @@ public void testCnCrashDuringDoAddPeer() throws Exception { KillNode.CONFIG_NODE); } + /** + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for + * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the + * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish + * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on + * PROCESSING, letting the parent procedure remove the source replica before the destination + * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + */ + @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) + public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { + successTestWithAction( + 1, + 1, + 3, + 2, + buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + noKillPoints(), + actionOfGracefullyRestartConfigNode, + KillNode.CONFIG_NODE); + } + @Test public void cnCrashDuringUpdateCacheTest() throws Exception { successTest( diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java index 67b318691a945..82b72ceb1eb78 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java @@ -25,6 +25,7 @@ import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Test; @@ -76,6 +77,30 @@ public void cnCrashDuringDoAddPeerTest() throws Exception { KillNode.CONFIG_NODE); } + /** + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for + * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the + * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish + * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on + * PROCESSING, letting the parent procedure remove the source replica before the destination + * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + */ + @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) + public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { + successTestWithAction( + 1, + 1, + 3, + 2, + buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + noKillPoints(), + actionOfGracefullyRestartConfigNode, + KillNode.CONFIG_NODE); + } + @Test public void cnCrashDuringUpdateCacheTest() throws Exception { successTest( diff --git a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index a43e5151d4042..f4ae087b159b7 100644 --- a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -1001,7 +1001,7 @@ public final class ProcedureMessages { public static final String VALIDATE_TABLE_FOR_TABLE_WHEN_SETTING_PROPERTIES = "Validate table for table {}.{} when setting properties"; public static final String WAITTASKFINISH_RETURNS_PROCESSING_WHICH_MEANS_THE_WAITING_HAS_BEEN_INTERRUPTED = - "waitTaskFinish() returns PROCESSING, which means the waiting has been interrupted, this procedure will end without rollback"; + "waitTaskFinish() returns PROCESSING, which means the waiting has been interrupted (ConfigNode shutdown or leader change); the AddRegionPeer task is still running on the coordinator, this procedure will stay at DO_ADD_REGION_PEER and resume polling after recovery"; public static final String FAILED_TO_CREATE_DATABASE_THE_TTL_SHOULD_BE_NON_NEGATIVE = "Failed to create database. The TTL should be non-negative."; public static final String FAILED_TO_CREATE_DATABASE_THE_DATAREGIONGROUPNUM_SHOULD_BE_POSITIVE = "Failed to create database. The dataRegionGroupNum should be positive."; diff --git a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java index dbad526cce7b8..126055a7c57c7 100644 --- a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java +++ b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java @@ -999,7 +999,7 @@ public final class ProcedureMessages { public static final String VALIDATE_TABLE_FOR_TABLE_WHEN_SETTING_PROPERTIES = "Validate table for table {}.{} when setting properties"; public static final String WAITTASKFINISH_RETURNS_PROCESSING_WHICH_MEANS_THE_WAITING_HAS_BEEN_INTERRUPTED = - "waitTaskFinish() returns PROCESSING, which means the waiting has been interrupted, this procedure will end without rollback"; + "waitTaskFinish() 返回 PROCESSING,表示等待被中断(ConfigNode 关闭或主节点切换);AddRegionPeer 任务仍在协调者上运行,该流程将停留在 DO_ADD_REGION_PEER 状态,恢复后继续轮询"; public static final String FAILED_TO_CREATE_DATABASE_THE_TTL_SHOULD_BE_NON_NEGATIVE = "创建数据库失败。TTL 不能为负数。"; public static final String FAILED_TO_CREATE_DATABASE_THE_DATAREGIONGROUPNUM_SHOULD_BE_POSITIVE = "创建数据库失败。dataRegionGroupNum 应为正数。"; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java index d09647a332f5b..dac9a6bae0458 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java @@ -124,10 +124,22 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, AddRegionPeerState s return warnAndRollBackAndNoMoreState( env, handler, String.format("%s result is %s", state, result.getTaskStatus())); case PROCESSING: + // waitTaskFinish() only returns PROCESSING when its polling loop was interrupted by + // an InterruptedException, i.e. this ConfigNode is shutting down / losing leadership + // (a user CANCEL or a coordinator disconnection both go through the FAIL branch + // above). The AddRegionPeerTask is still running on the coordinator DataNode, so we + // must NOT silently end here: doing so would let the parent RegionMigrateProcedure + // proceed to CHECK_ADD_REGION_PEER / REMOVE_REGION_PEER and remove the source replica + // before the destination replica has actually finished receiving the snapshot. + // Instead, stay in DO_ADD_REGION_PEER and persist it; after recovery the new leader + // re-enters this state and re-polls the still-running coordinator task (the + // isStateDeserialized() guard above prevents re-submitting the task) until it really + // reaches SUCCESS or FAIL. LOGGER.info( ProcedureMessages .WAITTASKFINISH_RETURNS_PROCESSING_WHICH_MEANS_THE_WAITING_HAS_BEEN_INTERRUPTED); - return Flow.NO_MORE_STATE; + setNextState(AddRegionPeerState.DO_ADD_REGION_PEER); + break outerSwitch; case SUCCESS: setNextState(UPDATE_REGION_LOCATION_CACHE); break outerSwitch; From e20f3f9238d1c44288649c218acb63305af5d373 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 11 Jun 2026 13:26:07 +0800 Subject: [PATCH 2/5] Narrow cnLeaderSwitchDuringDoAddPeerTest back to DailyIT-only The temporary @Category({DailyIT.class, ClusterIT.class}) tags were only to validate the new tests on the per-PR Cluster IT pipeline. That run is green, so narrow them back to the class-level DailyIT category (remove the method-level @Category, the temporary comment, and the now-unused ClusterIT import). --- .../daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java | 4 ---- .../batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java | 4 ---- .../IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java | 4 ---- .../ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java | 4 ---- 4 files changed, 16 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java index b9fb27f97fd27..895b483772770 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java @@ -28,7 +28,6 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; -import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -102,9 +101,6 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. */ @Test - // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for - // validation; will be narrowed back to DailyIT-only before merge. - @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java index 63fbbe12516fe..64bca2b5aed60 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java @@ -26,7 +26,6 @@ import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; -import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Ignore; @@ -89,9 +88,6 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. */ @Test - // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for - // validation; will be narrowed back to DailyIT-only before merge. - @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java index 3fece46382501..9778cb5b0545b 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java @@ -28,7 +28,6 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; -import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -103,9 +102,6 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. */ @Test - // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for - // validation; will be narrowed back to DailyIT-only before merge. - @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java index 82b72ceb1eb78..2740dcaf60924 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java @@ -25,7 +25,6 @@ import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; -import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Test; @@ -86,9 +85,6 @@ public void cnCrashDuringDoAddPeerTest() throws Exception { * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. */ @Test - // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for - // validation; will be narrowed back to DailyIT-only before merge. - @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, From 6dd498e1dcf8943ab9b8ffbd299e605bfa6f8868 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 11 Jun 2026 16:57:45 +0800 Subject: [PATCH 3/5] Address review: avoid reExecute loop on shutdown and make the IT exercise PROCESSING Two issues raised in review of the leader-switch fix: 1. Returning HAS_MORE_STATE for the same DO_ADD_REGION_PEER state makes ProcedureExecutor.executeProcedure re-execute the procedure in place (subprocs[0] == proc -> reExecute = true). On shutdown / leader switch the worker is interrupted, waitTaskFinish() returns PROCESSING, and the procedure parks at DO_ADD_REGION_PEER again. Because stopExecutor() runs executor.stop() and executor.join() before store.stop(), store.isRunning() is still true while join() waits for this worker, so the inner reExecute loop would spin (re-submitting / re-polling) and join() would hang. Fix (framework layer): the inner loop now also checks the executor's own running flag - `if (!isRunning() || !store.isRunning()) return;` - so the worker exits cleanly when the executor is stopping. The persisted state lets the next leader resume. Hardening (procedure layer): only submit the AddRegionPeerTask when getCycles() == 0 (first entry) in addition to the existing isStateDeserialized() guard, so an in-place re-entry never re-submits. 2. cnLeaderSwitchDuringDoAddPeerTest used the DO_ADD_REGION_PEER kill point, which fires after the task is submitted but before waitTaskFinish() starts polling. If AddPeer finished quickly the procedure took the SUCCESS branch and the new PROCESSING branch was never exercised (confirmed: the prior CI run's logs contain no "returns PROCESSING"). Fix: add a RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING kill point fired from inside waitTaskFinish() once the first poll confirms the task is still PROCESSING - i.e. when the worker is provably blocked in the wait. The graceful stop then deterministically interrupts waitTaskFinish(). The test registers this kill point and additionally asserts a ConfigNode log contains "returns PROCESSING", so it fails loudly if the branch is skipped. --- ...RegionOperationReliabilityITFramework.java | 32 +++++++++++++++++ ...DBRegionMigrateConfigNodeCrashIoTV1IT.java | 18 ++++++---- ...ionMigrateConfigNodeCrashIoTV2BatchIT.java | 18 ++++++---- ...onMigrateConfigNodeCrashIoTV2StreamIT.java | 18 ++++++---- ...egionMigrateConfigNodeCrashForRatisIT.java | 18 ++++++---- .../procedure/ProcedureExecutor.java | 10 +++++- .../procedure/env/RegionMaintainHandler.java | 23 +++++++++++++ .../impl/region/AddRegionPeerProcedure.java | 16 +++++++-- .../KillPoint/RegionMaintainKillPoints.java | 34 +++++++++++++++++++ 9 files changed, 155 insertions(+), 32 deletions(-) create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/KillPoint/RegionMaintainKillPoints.java diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java index b2da179dfb7e9..9bc03afebb291 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java @@ -58,6 +58,9 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; @@ -81,6 +84,7 @@ import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly; @@ -890,4 +894,32 @@ protected static Map> getRegionStatusMap(Session s } return result; } + + /** + * Fail the test unless at least one ConfigNode's log contains {@code substring}. Used to assert + * that a specific code path was actually exercised (e.g. the AddRegionPeer PROCESSING branch when + * the leader was gracefully stopped). The old leader writes the line before it restarts, so we + * scan every ConfigNode's {@code log_confignode_all.log}. + */ + protected static void assertSomeConfigNodeLogContains(String substring) { + List configNodeWrappers = EnvFactory.getEnv().getConfigNodeWrapperList(); + for (ConfigNodeWrapper configNodeWrapper : configNodeWrappers) { + Path logPath = Paths.get(configNodeWrapper.getNodePath(), "logs", "log_confignode_all.log"); + if (!Files.exists(logPath)) { + continue; + } + try (Stream lines = Files.lines(logPath)) { + if (lines.anyMatch(line -> line.contains(substring))) { + LOGGER.info("Found expected log line containing \"{}\" in {}", substring, logPath); + return; + } + } catch (IOException e) { + LOGGER.warn("Failed to read ConfigNode log {}", logPath, e); + } + } + Assert.fail( + "Expected at least one ConfigNode log to contain \"" + + substring + + "\", but none did. The code path under test was not exercised."); + } } diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java index 895b483772770..d0485d7afb83e 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.utils.KillPoint.KillNode; import org.apache.iotdb.commons.utils.KillPoint.KillPoint; +import org.apache.iotdb.commons.utils.KillPoint.RegionMaintainKillPoints; import org.apache.iotdb.confignode.it.regionmigration.IoTDBRegionOperationReliabilityITFramework; import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; @@ -93,12 +94,14 @@ public void testCnCrashDuringDoAddPeer() throws Exception { } /** - * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for - * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the - * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish - * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on - * PROCESSING, letting the parent procedure remove the source replica before the destination - * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is blocked in + * waitTaskFinish() polling the coordinator. The kill point fires from inside waitTaskFinish() + * (after the first poll confirms the task is still running), so the graceful shutdown + * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration + * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure + * silently ended on PROCESSING, letting the parent procedure remove the source replica before the + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, + * and asserts the PROCESSING branch was actually exercised. */ @Test public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { @@ -107,10 +110,11 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { 1, 3, 2, - buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + buildSet(RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING), noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); + assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java index 64bca2b5aed60..8868bf67d6cc8 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.utils.KillPoint.KillNode; import org.apache.iotdb.commons.utils.KillPoint.KillPoint; +import org.apache.iotdb.commons.utils.KillPoint.RegionMaintainKillPoints; import org.apache.iotdb.confignode.it.regionmigration.IoTDBRegionOperationReliabilityITFramework; import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; @@ -80,12 +81,14 @@ public void testCnCrashDuringDoAddPeer() throws Exception { } /** - * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for - * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the - * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish - * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on - * PROCESSING, letting the parent procedure remove the source replica before the destination - * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is blocked in + * waitTaskFinish() polling the coordinator. The kill point fires from inside waitTaskFinish() + * (after the first poll confirms the task is still running), so the graceful shutdown + * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration + * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure + * silently ended on PROCESSING, letting the parent procedure remove the source replica before the + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, + * and asserts the PROCESSING branch was actually exercised. */ @Test public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { @@ -94,10 +97,11 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { 1, 3, 2, - buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + buildSet(RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING), noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); + assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java index 9778cb5b0545b..8b3653610e39f 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.utils.KillPoint.KillNode; import org.apache.iotdb.commons.utils.KillPoint.KillPoint; +import org.apache.iotdb.commons.utils.KillPoint.RegionMaintainKillPoints; import org.apache.iotdb.confignode.it.regionmigration.IoTDBRegionOperationReliabilityITFramework; import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; @@ -94,12 +95,14 @@ public void testCnCrashDuringDoAddPeer() throws Exception { } /** - * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for - * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the - * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish - * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on - * PROCESSING, letting the parent procedure remove the source replica before the destination - * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is blocked in + * waitTaskFinish() polling the coordinator. The kill point fires from inside waitTaskFinish() + * (after the first poll confirms the task is still running), so the graceful shutdown + * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration + * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure + * silently ended on PROCESSING, letting the parent procedure remove the source replica before the + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, + * and asserts the PROCESSING branch was actually exercised. */ @Test public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { @@ -108,10 +111,11 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { 1, 3, 2, - buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + buildSet(RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING), noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); + assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java index 2740dcaf60924..e18f236b5bccf 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.utils.KillPoint.KillNode; import org.apache.iotdb.commons.utils.KillPoint.KillPoint; +import org.apache.iotdb.commons.utils.KillPoint.RegionMaintainKillPoints; import org.apache.iotdb.confignode.it.regionmigration.IoTDBRegionMigrateITFrameworkForRatis; import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; @@ -77,12 +78,14 @@ public void cnCrashDuringDoAddPeerTest() throws Exception { } /** - * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is waiting for - * the coordinator's task to finish (DO_ADD_REGION_PEER). The graceful shutdown interrupts the - * procedure worker, so waitTaskFinish() returns PROCESSING. The migration must still finish - * correctly after a leader switch: previously the AddRegionPeerProcedure silently ended on - * PROCESSING, letting the parent procedure remove the source replica before the destination - * replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * Gracefully restart (not forcibly kill) the ConfigNode leader while AddRegionPeer is blocked in + * waitTaskFinish() polling the coordinator. The kill point fires from inside waitTaskFinish() + * (after the first poll confirms the task is still running), so the graceful shutdown + * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration + * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure + * silently ended on PROCESSING, letting the parent procedure remove the source replica before the + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, + * and asserts the PROCESSING branch was actually exercised. */ @Test public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { @@ -91,10 +94,11 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { 1, 3, 2, - buildSet(AddRegionPeerState.DO_ADD_REGION_PEER), + buildSet(RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING), noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); + assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/ProcedureExecutor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/ProcedureExecutor.java index 82afea3859fd4..b0e5d73783571 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/ProcedureExecutor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/ProcedureExecutor.java @@ -454,7 +454,15 @@ private void executeProcedure(RootProcedureStack rootProcStack, Procedure p updateStoreOnExecution(rootProcStack, proc, subprocs); - if (!store.isRunning()) { + // Stop the in-place re-execution loop once this executor is shutting down (e.g. ConfigNode + // leader switch / restart). Checking store.isRunning() alone is not enough: stopExecutor() + // calls executor.stop() and executor.join() before store.stop(), so the store is still + // running while join() waits for this very worker to finish. Without also checking the + // executor's own running flag, a procedure that keeps returning HAS_MORE_STATE for the same + // state (e.g. AddRegionPeerProcedure parking at DO_ADD_REGION_PEER after waitTaskFinish() is + // interrupted) would re-execute forever here and join() would hang. The persisted state lets + // the next leader resume from where it stopped. + if (!isRunning() || !store.isRunning()) { return; } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java index 8637e6c8fe781..9ce0c9f72a3c8 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RegionMaintainHandler.java @@ -89,6 +89,7 @@ import static org.apache.iotdb.commons.pipe.config.constant.PipeSourceConstant.EXTRACTOR_IOTDB_USER_KEY; import static org.apache.iotdb.commons.pipe.config.constant.PipeSourceConstant.EXTRACTOR_KEY; import static org.apache.iotdb.commons.pipe.config.constant.PipeSourceConstant.EXTRACTOR_REALTIME_MODE_KEY; +import static org.apache.iotdb.commons.utils.KillPoint.KillPoint.setKillPoint; import static org.apache.iotdb.confignode.conf.ConfigNodeConstant.REGION_MIGRATE_PROCESS; import static org.apache.iotdb.consensus.ConsensusFactory.IOT_CONSENSUS; import static org.apache.iotdb.consensus.ConsensusFactory.IOT_CONSENSUS_V2; @@ -360,10 +361,26 @@ public Map resetPeerList( // TODO: will use 'procedure yield' to refactor later public TRegionMigrateResult waitTaskFinish(long taskId, TDataNodeLocation dataNodeLocation) { + return waitTaskFinish(taskId, dataNodeLocation, null); + } + + /** + * Poll the coordinator DataNode until the region-maintain task identified by {@code taskId} + * reaches a terminal state. + * + * @param killPoint if non-null, fired once right after the first poll confirms the task is still + * PROCESSING. At that point the worker thread is provably blocked inside this method, so + * tests can use the kill point to deterministically interrupt the wait (e.g. by gracefully + * stopping the ConfigNode leader) and exercise the interrupted-PROCESSING path. It is a no-op + * outside integration tests. + */ + public > TRegionMigrateResult waitTaskFinish( + long taskId, TDataNodeLocation dataNodeLocation, T killPoint) { final long MAX_DISCONNECTION_TOLERATE_MS = 600_000; final long INITIAL_DISCONNECTION_TOLERATE_MS = 60_000; long startTime = System.nanoTime(); long lastReportTime = System.nanoTime(); + boolean killPointTriggered = false; while (true) { try (SyncDataNodeInternalServiceClient dataNodeClient = dataNodeClientManager.borrowClient(dataNodeLocation.getInternalEndPoint())) { @@ -372,6 +389,12 @@ public TRegionMigrateResult waitTaskFinish(long taskId, TDataNodeLocation dataNo if (report.getTaskStatus() != TRegionMaintainTaskStatus.PROCESSING) { return report; } + // The task is confirmed still running and this thread is blocked here, so it is now safe to + // fire the kill point that tests use to interrupt waitTaskFinish() deterministically. + if (killPoint != null && !killPointTriggered) { + setKillPoint(killPoint); + killPointTriggered = true; + } } catch (Exception ignore) { } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java index dac9a6bae0458..acce4b2e8491e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/AddRegionPeerProcedure.java @@ -26,6 +26,7 @@ import org.apache.iotdb.commons.exception.runtime.ThriftSerDeException; import org.apache.iotdb.commons.queryengine.utils.DateTimeUtils; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; +import org.apache.iotdb.commons.utils.KillPoint.RegionMaintainKillPoints; import org.apache.iotdb.commons.utils.ThriftCommonsSerDeUtils; import org.apache.iotdb.confignode.i18n.ProcedureMessages; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; @@ -104,8 +105,15 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, AddRegionPeerState s break; case DO_ADD_REGION_PEER: handler.forceUpdateRegionCache(regionId, targetDataNode, RegionStatus.Adding); - // We don't want to re-submit AddRegionPeerTask when leader change or ConfigNode reboot - if (!this.isStateDeserialized()) { + // Only submit the AddRegionPeerTask on the very first entry of this state. We must NOT + // re-submit when: + // - the state was restored from disk after a leader change / ConfigNode reboot + // (isStateDeserialized()), or + // - this state is being re-entered in place because a previous attempt parked here on + // PROCESSING (getCycles() > 0, see the PROCESSING branch below). + // The coordinator DataNode also dedups by taskId, so a duplicate submit would be a no-op, + // but skipping it here avoids the useless RPC and keeps the re-poll cheap. + if (!this.isStateDeserialized() && getCycles() == 0) { TSStatus tsStatus = handler.submitAddRegionPeerTask( this.getProcId(), targetDataNode, regionId, coordinator); @@ -115,7 +123,9 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, AddRegionPeerState s env, handler, "submit DO_ADD_REGION_PEER task fail"); } } - TRegionMigrateResult result = handler.waitTaskFinish(this.getProcId(), coordinator); + TRegionMigrateResult result = + handler.waitTaskFinish( + this.getProcId(), coordinator, RegionMaintainKillPoints.WAIT_TASK_FINISH_POLLING); switch (result.getTaskStatus()) { case TASK_NOT_EXIST: // coordinator crashed and lost its task table diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/KillPoint/RegionMaintainKillPoints.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/KillPoint/RegionMaintainKillPoints.java new file mode 100644 index 0000000000000..da906b845fdaa --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/KillPoint/RegionMaintainKillPoints.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.utils.KillPoint; + +/** Kill points for the ConfigNode-side region-maintain procedures (AddPeer / RemovePeer). */ +public enum RegionMaintainKillPoints { + /** + * Fired from {@code RegionMaintainHandler.waitTaskFinish()} once the coordinator DataNode has + * confirmed the task is still running (the first poll returned PROCESSING). Unlike the + * AddRegionPeerState.DO_ADD_REGION_PEER kill point, which fires right after the task is submitted + * and before waitTaskFinish() starts polling, this kill point guarantees the procedure worker is + * actually blocked inside waitTaskFinish(). Tests use it to deterministically interrupt + * waitTaskFinish() (e.g. by gracefully stopping the ConfigNode leader) so the PROCESSING branch + * is exercised instead of racing the task to completion. + */ + WAIT_TASK_FINISH_POLLING, +} From 316e2b867b697860bb929cabd4e76781f9a90297 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 11 Jun 2026 17:27:51 +0800 Subject: [PATCH 4/5] Temporarily run cnLeaderSwitchDuringDoAddPeerTest on per-PR Cluster IT Tag the four cnLeaderSwitchDuringDoAddPeerTest methods with ClusterIT (in addition to DailyIT) so the per-PR Cluster IT pipeline runs them and we can confirm the new WAIT_TASK_FINISH_POLLING kill point deterministically drives the PROCESSING branch (asserted via the new ConfigNode-log check). This will be reverted to DailyIT-only before merge. --- .../daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java | 4 ++++ .../batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java | 4 ++++ .../IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java | 4 ++++ .../ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java index d0485d7afb83e..1f5793617939a 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java @@ -29,6 +29,7 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -104,6 +105,9 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * and asserts the PROCESSING branch was actually exercised. */ @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java index 8868bf67d6cc8..6df7ae0fa14e6 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java @@ -27,6 +27,7 @@ import org.apache.iotdb.confignode.procedure.state.RegionTransitionState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Ignore; @@ -91,6 +92,9 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * and asserts the PROCESSING branch was actually exercised. */ @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java index 8b3653610e39f..ec5fab397cdf1 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java @@ -29,6 +29,7 @@ import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Before; @@ -105,6 +106,9 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * and asserts the PROCESSING branch was actually exercised. */ @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java index e18f236b5bccf..df01a2393d7cd 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java @@ -26,6 +26,7 @@ import org.apache.iotdb.confignode.procedure.state.AddRegionPeerState; import org.apache.iotdb.confignode.procedure.state.RemoveRegionPeerState; import org.apache.iotdb.it.framework.IoTDBTestRunner; +import org.apache.iotdb.itbase.category.ClusterIT; import org.apache.iotdb.itbase.category.DailyIT; import org.junit.Test; @@ -88,6 +89,9 @@ public void cnCrashDuringDoAddPeerTest() throws Exception { * and asserts the PROCESSING branch was actually exercised. */ @Test + // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for + // validation; will be narrowed back to DailyIT-only before merge. + @Category({DailyIT.class, ClusterIT.class}) public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { successTestWithAction( 1, From 9cc0f631660712649a69744f2aa9ca86e7499b2c Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Thu, 11 Jun 2026 18:30:10 +0800 Subject: [PATCH 5/5] Drop the flaky returns-PROCESSING log assertion; rely on the kill point The per-PR Cluster IT run showed cnLeaderSwitchDuringDoAddPeerTest failing on assertSomeConfigNodeLogContains("returns PROCESSING") for all four consensus protocols, even though the migration itself completed correctly. The WAIT_TASK_FINISH_POLLING kill point did fire (confirmed on disk), i.e. the worker reached waitTaskFinish() and was interrupted by the graceful stop - but the "returns PROCESSING" line is logged during the ConfigNode shutdown sequence, so logback's async appender is often already torn down and the line never reaches disk. Asserting on a log line emitted during shutdown is inherently racy. The assertion is also redundant: generalTestWithAllOptions already calls checkKillPointsAllTriggered(), which fails the test unless the WAIT_TASK_FINISH_POLLING kill point fired - and that kill point is emitted only from inside waitTaskFinish() after the first poll confirms the task is still PROCESSING. So the framework already guarantees the worker was blocked in the wait when the leader was stopped, i.e. the interrupted-PROCESSING branch was exercised. Combined with the migration-success check, that is a reliable and stronger guarantee than scanning for a shutdown-time log line. Remove assertSomeConfigNodeLogContains and its helper/imports. --- ...RegionOperationReliabilityITFramework.java | 32 ------------------- ...DBRegionMigrateConfigNodeCrashIoTV1IT.java | 7 ++-- ...ionMigrateConfigNodeCrashIoTV2BatchIT.java | 7 ++-- ...onMigrateConfigNodeCrashIoTV2StreamIT.java | 7 ++-- ...egionMigrateConfigNodeCrashForRatisIT.java | 7 ++-- 5 files changed, 16 insertions(+), 44 deletions(-) diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java index 9bc03afebb291..b2da179dfb7e9 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/IoTDBRegionOperationReliabilityITFramework.java @@ -58,9 +58,6 @@ import java.io.File; import java.io.IOException; import java.io.InputStreamReader; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; @@ -84,7 +81,6 @@ import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Collectors; -import java.util.stream.Stream; import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly; @@ -894,32 +890,4 @@ protected static Map> getRegionStatusMap(Session s } return result; } - - /** - * Fail the test unless at least one ConfigNode's log contains {@code substring}. Used to assert - * that a specific code path was actually exercised (e.g. the AddRegionPeer PROCESSING branch when - * the leader was gracefully stopped). The old leader writes the line before it restarts, so we - * scan every ConfigNode's {@code log_confignode_all.log}. - */ - protected static void assertSomeConfigNodeLogContains(String substring) { - List configNodeWrappers = EnvFactory.getEnv().getConfigNodeWrapperList(); - for (ConfigNodeWrapper configNodeWrapper : configNodeWrappers) { - Path logPath = Paths.get(configNodeWrapper.getNodePath(), "logs", "log_confignode_all.log"); - if (!Files.exists(logPath)) { - continue; - } - try (Stream lines = Files.lines(logPath)) { - if (lines.anyMatch(line -> line.contains(substring))) { - LOGGER.info("Found expected log line containing \"{}\" in {}", substring, logPath); - return; - } - } catch (IOException e) { - LOGGER.warn("Failed to read ConfigNode log {}", logPath, e); - } - } - Assert.fail( - "Expected at least one ConfigNode log to contain \"" - + substring - + "\", but none did. The code path under test was not exercised."); - } } diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java index 1f5793617939a..db01ab312e333 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv1/IoTDBRegionMigrateConfigNodeCrashIoTV1IT.java @@ -101,8 +101,10 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure * silently ended on PROCESSING, letting the parent procedure remove the source replica before the - * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, - * and asserts the PROCESSING branch was actually exercised. + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * The framework requires the WAIT_TASK_FINISH_POLLING kill point to fire, which can only happen + * once the worker is blocked inside waitTaskFinish(), so the PROCESSING branch is guaranteed to + * be exercised. */ @Test // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for @@ -118,7 +120,6 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); - assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java index 6df7ae0fa14e6..adec48f883a34 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/batch/IoTDBRegionMigrateConfigNodeCrashIoTV2BatchIT.java @@ -88,8 +88,10 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure * silently ended on PROCESSING, letting the parent procedure remove the source replica before the - * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, - * and asserts the PROCESSING branch was actually exercised. + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * The framework requires the WAIT_TASK_FINISH_POLLING kill point to fire, which can only happen + * once the worker is blocked inside waitTaskFinish(), so the PROCESSING branch is guaranteed to + * be exercised. */ @Test // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for @@ -105,7 +107,6 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); - assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java index ec5fab397cdf1..6ee82324bf46d 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/iotv2/stream/IoTDBRegionMigrateConfigNodeCrashIoTV2StreamIT.java @@ -102,8 +102,10 @@ public void testCnCrashDuringDoAddPeer() throws Exception { * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure * silently ended on PROCESSING, letting the parent procedure remove the source replica before the - * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, - * and asserts the PROCESSING branch was actually exercised. + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * The framework requires the WAIT_TASK_FINISH_POLLING kill point to fire, which can only happen + * once the worker is blocked inside waitTaskFinish(), so the PROCESSING branch is guaranteed to + * be exercised. */ @Test // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for @@ -119,7 +121,6 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); - assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java index df01a2393d7cd..e6f9f1f298428 100644 --- a/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/regionmigration/pass/daily/ratis/IoTDBRegionMigrateConfigNodeCrashForRatisIT.java @@ -85,8 +85,10 @@ public void cnCrashDuringDoAddPeerTest() throws Exception { * deterministically interrupts that wait and waitTaskFinish() returns PROCESSING. The migration * must still finish correctly after a leader switch: previously the AddRegionPeerProcedure * silently ended on PROCESSING, letting the parent procedure remove the source replica before the - * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens, - * and asserts the PROCESSING branch was actually exercised. + * destination replica was actually Running. Uses 3 ConfigNodes so a real leader switch happens. + * The framework requires the WAIT_TASK_FINISH_POLLING kill point to fire, which can only happen + * once the worker is blocked inside waitTaskFinish(), so the PROCESSING branch is guaranteed to + * be exercised. */ @Test // Temporarily also categorized as ClusterIT so the per-PR Cluster IT (1C3D) job runs it for @@ -102,7 +104,6 @@ public void cnLeaderSwitchDuringDoAddPeerTest() throws Exception { noKillPoints(), actionOfGracefullyRestartConfigNode, KillNode.CONFIG_NODE); - assertSomeConfigNodeLogContains("returns PROCESSING"); } @Test