From 4056e853e01381b2ab875d7297c0f974dd01f1ae Mon Sep 17 00:00:00 2001 From: Arun Sarin Date: Thu, 2 Apr 2026 01:45:51 +0530 Subject: [PATCH 1/2] HDDS-8703. [Snapshot] Integration test for SnapshotDeletingService during OM failover --- .../snapshot/TestOzoneManagerHASnapshot.java | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java index b6008ab3d2e2..d2f0ec7c4d59 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java @@ -25,6 +25,7 @@ import static org.apache.ozone.test.LambdaTestUtils.await; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -300,6 +301,145 @@ public void testSnapshotChainManagerRestore() throws Exception { assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted()); } + /** + * Tests that SnapshotDeletingService (SDS) correctly handles an OM leader + * failover. The old leader's SDS is suspended (simulating SDS being blocked + * or mid-cleanup) when a snapshot is queued for deletion. After the leader + * failover, the new leader's SDS picks up the pending work and correctly + * purges the snapshot. 
(HDDS-8703) + */ + @Test + public void testSnapshotDeletingServiceDuringOMFailover() + throws Exception { + OzoneManager oldLeader = cluster.getOMLeader(); + String oldLeaderId = oldLeader.getOMNodeId(); + + // Create keys and a snapshot so there is data to clean up. + int numKeys = 5; + for (int i = 0; i < numKeys; i++) { + createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10)); + } + + String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10); + createSnapshot(volumeName, bucketName, snapshotName); + + // Suspend SDS on the current leader before the snapshot is deleted, + // simulating SDS being blocked while a cleanup is pending. + oldLeader.getKeyManager().getSnapshotDeletingService().suspend(); + + // Delete the snapshot — marks it as SNAPSHOT_DELETED in the DB. + store.deleteSnapshot(volumeName, bucketName, snapshotName); + String tableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName); + + // Wait for the snapshot entry to reach SNAPSHOT_DELETED state on old leader. + GenericTestUtils.waitFor(() -> { + try { + SnapshotInfo info = oldLeader.getMetadataManager() + .getSnapshotInfoTable().get(tableKey); + return info != null + && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED; + } catch (IOException e) { + throw new RuntimeException(e); + } + }, 1000, 30000); + + try { + // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining + // two nodes elect a new leader. + cluster.shutdownOzoneManager(oldLeader); + cluster.waitForLeaderOM(); + + OzoneManager newLeader = cluster.getOMLeader(); + assertNotNull(newLeader); + // Confirm that a genuinely different OM node became leader. + assertNotEquals(oldLeaderId, newLeader.getOMNodeId()); + + // The new leader's SDS (not suspended) must process the pending deleted + // snapshot and purge it from the DB, even though the old leader's SDS + // never ran the cleanup. 
+ checkSnapshotIsPurgedFromDB(newLeader, tableKey); + + // Verify the snapshot chain is not corrupted after the cleanup. + OmMetadataManagerImpl metadataManager = + (OmMetadataManagerImpl) newLeader.getMetadataManager(); + assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted()); + } finally { + // Restore the 3-node cluster for subsequent tests. + cluster.restartOzoneManager(oldLeader, true); + } + } + + /** + * Tests that SDS on the new leader correctly handles multiple snapshots + * queued for deletion after an OM leader failover. After the failover, all + * pending deletions should be completed and the snapshot chain should remain + * consistent. (HDDS-8703) + */ + @Test + public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover() + throws Exception { + OzoneManager oldLeader = cluster.getOMLeader(); + String oldLeaderId = oldLeader.getOMNodeId(); + + int numSnapshots = 3; + List<String> snapshotNames = new ArrayList<>(); + List<String> tableKeys = new ArrayList<>(); + + // Create multiple snapshots, each capturing distinct state. + for (int i = 0; i < numSnapshots; i++) { + createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10)); + String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10); + createSnapshot(volumeName, bucketName, snapshotName); + snapshotNames.add(snapshotName); + tableKeys.add(SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName)); + } + + // Suspend SDS on the current leader so no cleanup starts yet. + oldLeader.getKeyManager().getSnapshotDeletingService().suspend(); + + // Queue all snapshots for deletion. + for (String snapshotName : snapshotNames) { + store.deleteSnapshot(volumeName, bucketName, snapshotName); + } + + // Wait for every snapshot to be marked SNAPSHOT_DELETED on the old leader. 
+ for (String tableKey : tableKeys) { + GenericTestUtils.waitFor(() -> { + try { + SnapshotInfo info = oldLeader.getMetadataManager() + .getSnapshotInfoTable().get(tableKey); + return info != null + && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED; + } catch (IOException e) { + throw new RuntimeException(e); + } + }, 1000, 30000); + } + + try { + // Trigger leader failover. + cluster.shutdownOzoneManager(oldLeader); + cluster.waitForLeaderOM(); + + OzoneManager newLeader = cluster.getOMLeader(); + assertNotNull(newLeader); + assertNotEquals(oldLeaderId, newLeader.getOMNodeId()); + + // The new leader's SDS must purge all deleted snapshots. + for (String tableKey : tableKeys) { + checkSnapshotIsPurgedFromDB(newLeader, tableKey); + } + + // Verify snapshot chain integrity after all cleanups. + OmMetadataManagerImpl metadataManager = + (OmMetadataManagerImpl) newLeader.getMetadataManager(); + assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted()); + } finally { + // Restore the 3-node cluster for subsequent tests. + cluster.restartOzoneManager(oldLeader, true); + } + } + private void createFileKey(OzoneBucket bucket, String keyName) throws IOException { byte[] value = RandomStringUtils.secure().nextAscii(10240).getBytes(UTF_8); From ec812ecdc478a837158fb2e16fb99dd4dc5333c1 Mon Sep 17 00:00:00 2001 From: Arun Sarin Date: Thu, 2 Apr 2026 15:42:34 +0530 Subject: [PATCH 2/2] HDDS-8703. 
Addressed Review comments --- .../snapshot/TestOzoneManagerHASnapshot.java | 98 ++++--------------- 1 file changed, 19 insertions(+), 79 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java index d2f0ec7c4d59..9ba27a94f116 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java @@ -62,6 +62,8 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; /** * Tests snapshot in OM HA setup. @@ -303,89 +305,23 @@ public void testSnapshotChainManagerRestore() throws Exception { /** * Tests that SnapshotDeletingService (SDS) correctly handles an OM leader - * failover. The old leader's SDS is suspended (simulating SDS being blocked - * or mid-cleanup) when a snapshot is queued for deletion. After the leader - * failover, the new leader's SDS picks up the pending work and correctly - * purges the snapshot. (HDDS-8703) - */ - @Test - public void testSnapshotDeletingServiceDuringOMFailover() - throws Exception { - OzoneManager oldLeader = cluster.getOMLeader(); - String oldLeaderId = oldLeader.getOMNodeId(); - - // Create keys and a snapshot so there is data to clean up. 
- int numKeys = 5; - for (int i = 0; i < numKeys; i++) { - createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10)); - } - - String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10); - createSnapshot(volumeName, bucketName, snapshotName); - - // Suspend SDS on the current leader before the snapshot is deleted, - // simulating SDS being blocked while a cleanup is pending. - oldLeader.getKeyManager().getSnapshotDeletingService().suspend(); - - // Delete the snapshot — marks it as SNAPSHOT_DELETED in the DB. - store.deleteSnapshot(volumeName, bucketName, snapshotName); - String tableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName); - - // Wait for the snapshot entry to reach SNAPSHOT_DELETED state on old leader. - GenericTestUtils.waitFor(() -> { - try { - SnapshotInfo info = oldLeader.getMetadataManager() - .getSnapshotInfoTable().get(tableKey); - return info != null - && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED; - } catch (IOException e) { - throw new RuntimeException(e); - } - }, 1000, 30000); - - try { - // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining - // two nodes elect a new leader. - cluster.shutdownOzoneManager(oldLeader); - cluster.waitForLeaderOM(); - - OzoneManager newLeader = cluster.getOMLeader(); - assertNotNull(newLeader); - // Confirm that a genuinely different OM node became leader. - assertNotEquals(oldLeaderId, newLeader.getOMNodeId()); - - // The new leader's SDS (not suspended) must process the pending deleted - // snapshot and purge it from the DB, even though the old leader's SDS - // never ran the cleanup. - checkSnapshotIsPurgedFromDB(newLeader, tableKey); - - // Verify the snapshot chain is not corrupted after the cleanup. 
- OmMetadataManagerImpl metadataManager = - (OmMetadataManagerImpl) newLeader.getMetadataManager(); - assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted()); - } finally { - // Restore the 3-node cluster for subsequent tests. - cluster.restartOzoneManager(oldLeader, true); - } - } - - /** - * Tests that SDS on the new leader correctly handles multiple snapshots - * queued for deletion after an OM leader failover. After the failover, all - * pending deletions should be completed and the snapshot chain should remain + * failover with {@code numSnapshots} snapshots queued for deletion. The old + * leader's SDS is suspended (simulating SDS being blocked or mid-cleanup) + * before the failover. After the failover, the new leader's SDS must pick up + * all pending deletions, purge them from the DB, and leave the snapshot chain * consistent. (HDDS-8703) */ - @Test - public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover() + @ParameterizedTest + @ValueSource(ints = {1, 3}) + public void testSnapshotDeletingServiceDuringOMFailover(int numSnapshots) throws Exception { OzoneManager oldLeader = cluster.getOMLeader(); String oldLeaderId = oldLeader.getOMNodeId(); - int numSnapshots = 3; List<String> snapshotNames = new ArrayList<>(); List<String> tableKeys = new ArrayList<>(); - // Create multiple snapshots, each capturing distinct state. + // Create numSnapshots snapshots, each capturing distinct state. for (int i = 0; i < numSnapshots; i++) { createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10)); String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10); @@ -394,7 +330,8 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover() tableKeys.add(SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName)); } - // Suspend SDS on the current leader so no cleanup starts yet. 
+ // Suspend SDS on the current leader before any snapshot is deleted, + // simulating SDS being blocked while cleanup is pending. oldLeader.getKeyManager().getSnapshotDeletingService().suspend(); // Queue all snapshots for deletion. @@ -402,7 +339,7 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover() store.deleteSnapshot(volumeName, bucketName, snapshotName); } - // Wait for every snapshot to be marked SNAPSHOT_DELETED on the old leader. + // Wait for every snapshot to reach SNAPSHOT_DELETED state on the old leader. for (String tableKey : tableKeys) { GenericTestUtils.waitFor(() -> { try { @@ -417,20 +354,23 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover() } try { - // Trigger leader failover. + // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining + // two nodes elect a new leader. cluster.shutdownOzoneManager(oldLeader); cluster.waitForLeaderOM(); OzoneManager newLeader = cluster.getOMLeader(); assertNotNull(newLeader); + // Confirm that a genuinely different OM node became leader. assertNotEquals(oldLeaderId, newLeader.getOMNodeId()); - // The new leader's SDS must purge all deleted snapshots. + // The new leader's SDS (not suspended) must purge all deleted snapshots, + // even though the old leader's SDS never ran the cleanup. for (String tableKey : tableKeys) { checkSnapshotIsPurgedFromDB(newLeader, tableKey); } - // Verify snapshot chain integrity after all cleanups. + // Verify the snapshot chain is not corrupted after all cleanups. OmMetadataManagerImpl metadataManager = (OmMetadataManagerImpl) newLeader.getMetadataManager(); assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());