diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
index b6008ab3d2e2..9ba27a94f116 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
@@ -25,6 +25,7 @@
 import static org.apache.ozone.test.LambdaTestUtils.await;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -61,6 +62,8 @@
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 
 /**
  * Tests snapshot in OM HA setup.
@@ -300,6 +303,99 @@ public void testSnapshotChainManagerRestore() throws Exception {
     assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
   }
 
+  /**
+   * Tests that SnapshotDeletingService (SDS) correctly handles an OM leader
+   * failover with {@code numSnapshots} snapshots queued for deletion. The old
+   * leader's SDS is suspended (simulating SDS being blocked or mid-cleanup)
+   * before the failover. After the failover, the new leader's SDS must pick up
+   * all pending deletions, purge them from the DB, and leave the snapshot chain
+   * consistent. (HDDS-8703)
+   *
+   * @param numSnapshots number of snapshots to create and queue for deletion
+   *     before the failover is triggered
+   * @throws Exception if any cluster operation, wait, or assertion fails
+   */
+  @ParameterizedTest
+  @ValueSource(ints = {1, 3})
+  public void testSnapshotDeletingServiceDuringOMFailover(int numSnapshots)
+      throws Exception {
+    OzoneManager oldLeader = cluster.getOMLeader();
+    String oldLeaderId = oldLeader.getOMNodeId();
+
+    List<String> snapshotNames = new ArrayList<>();
+    List<String> tableKeys = new ArrayList<>();
+
+    // Create numSnapshots snapshots, each capturing distinct state.
+    for (int i = 0; i < numSnapshots; i++) {
+      createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10));
+      String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10);
+      createSnapshot(volumeName, bucketName, snapshotName);
+      snapshotNames.add(snapshotName);
+      tableKeys.add(SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName));
+    }
+
+    // Selects the cleanup in the finally block: once a shutdown has been
+    // attempted the old leader is restarted; before that its SDS is resumed.
+    boolean shutdownAttempted = false;
+
+    // Suspend SDS on the current leader before any snapshot is deleted,
+    // simulating SDS being blocked while cleanup is pending.
+    oldLeader.getKeyManager().getSnapshotDeletingService().suspend();
+    try {
+      // Queue all snapshots for deletion.
+      for (String snapshotName : snapshotNames) {
+        store.deleteSnapshot(volumeName, bucketName, snapshotName);
+      }
+
+      // Wait for every snapshot to reach SNAPSHOT_DELETED state on the old
+      // leader.
+      for (String tableKey : tableKeys) {
+        GenericTestUtils.waitFor(() -> {
+          try {
+            SnapshotInfo info = oldLeader.getMetadataManager()
+                .getSnapshotInfoTable().get(tableKey);
+            return info != null
+                && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED;
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }, 1000, 30000);
+      }
+
+      // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining
+      // two nodes elect a new leader.
+      shutdownAttempted = true;
+      cluster.shutdownOzoneManager(oldLeader);
+      cluster.waitForLeaderOM();
+
+      OzoneManager newLeader = cluster.getOMLeader();
+      assertNotNull(newLeader);
+      // Confirm that a genuinely different OM node became leader.
+      assertNotEquals(oldLeaderId, newLeader.getOMNodeId());
+
+      // The new leader's SDS (not suspended) must purge all deleted snapshots,
+      // even though the old leader's SDS never ran the cleanup.
+      for (String tableKey : tableKeys) {
+        checkSnapshotIsPurgedFromDB(newLeader, tableKey);
+      }
+
+      // Verify the snapshot chain is not corrupted after all cleanups.
+      OmMetadataManagerImpl metadataManager =
+          (OmMetadataManagerImpl) newLeader.getMetadataManager();
+      assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
+    } finally {
+      if (shutdownAttempted) {
+        // Restore the 3-node cluster for subsequent tests.
+        cluster.restartOzoneManager(oldLeader, true);
+      } else {
+        // The failover never started, so the old leader is still running with
+        // its SDS suspended. Resume it so that a failure above does not leave
+        // SDS suspended for subsequent tests.
+        oldLeader.getKeyManager().getSnapshotDeletingService().resume();
+      }
+    }
+  }
+
   private void createFileKey(OzoneBucket bucket, String keyName)
       throws IOException {
     byte[] value = RandomStringUtils.secure().nextAscii(10240).getBytes(UTF_8);