From 4056e853e01381b2ab875d7297c0f974dd01f1ae Mon Sep 17 00:00:00 2001
From: Arun Sarin <arunsarin22@gmail.com>
Date: Thu, 2 Apr 2026 01:45:51 +0530
Subject: [PATCH 1/2] HDDS-8703. [Snapshot] Integration test for
 SnapshotDeletingService during OM failover

---
 .../snapshot/TestOzoneManagerHASnapshot.java  | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
index b6008ab3d2e2..d2f0ec7c4d59 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
@@ -25,6 +25,7 @@
 import static org.apache.ozone.test.LambdaTestUtils.await;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -300,6 +301,145 @@ public void testSnapshotChainManagerRestore() throws Exception {
     assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
   }
 
+  /**
+   * Tests that SnapshotDeletingService (SDS) correctly handles an OM leader
+   * failover. The old leader's SDS is suspended (simulating SDS being blocked
+   * or mid-cleanup) when a snapshot is queued for deletion. After the leader
+   * failover, the new leader's SDS picks up the pending work and correctly
+   * purges the snapshot. (HDDS-8703)
+   */
+  @Test
+  public void testSnapshotDeletingServiceDuringOMFailover()
+      throws Exception {
+    OzoneManager oldLeader = cluster.getOMLeader();
+    String oldLeaderId = oldLeader.getOMNodeId();
+
+    // Create keys and a snapshot so there is data to clean up.
+    int numKeys = 5;
+    for (int i = 0; i < numKeys; i++) {
+      createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10));
+    }
+
+    String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10);
+    createSnapshot(volumeName, bucketName, snapshotName);
+
+    // Suspend SDS on the current leader before the snapshot is deleted,
+    // simulating SDS being blocked while a cleanup is pending.
+    oldLeader.getKeyManager().getSnapshotDeletingService().suspend();
+
+    // Delete the snapshot — marks it as SNAPSHOT_DELETED in the DB.
+    store.deleteSnapshot(volumeName, bucketName, snapshotName);
+    String tableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName);
+
+    // Wait for the snapshot entry to reach SNAPSHOT_DELETED state on old leader.
+    GenericTestUtils.waitFor(() -> {
+      try {
+        SnapshotInfo info = oldLeader.getMetadataManager()
+            .getSnapshotInfoTable().get(tableKey);
+        return info != null
+            && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }, 1000, 30000);
+
+    try {
+      // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining
+      // two nodes elect a new leader.
+      cluster.shutdownOzoneManager(oldLeader);
+      cluster.waitForLeaderOM();
+
+      OzoneManager newLeader = cluster.getOMLeader();
+      assertNotNull(newLeader);
+      // Confirm that a genuinely different OM node became leader.
+      assertNotEquals(oldLeaderId, newLeader.getOMNodeId());
+
+      // The new leader's SDS (not suspended) must process the pending deleted
+      // snapshot and purge it from the DB, even though the old leader's SDS
+      // never ran the cleanup.
+      checkSnapshotIsPurgedFromDB(newLeader, tableKey);
+
+      // Verify the snapshot chain is not corrupted after the cleanup.
+      OmMetadataManagerImpl metadataManager =
+          (OmMetadataManagerImpl) newLeader.getMetadataManager();
+      assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
+    } finally {
+      // Restore the 3-node cluster for subsequent tests.
+      cluster.restartOzoneManager(oldLeader, true);
+    }
+  }
+
+  /**
+   * Tests that SDS on the new leader correctly handles multiple snapshots
+   * queued for deletion after an OM leader failover. After the failover, all
+   * pending deletions should be completed and the snapshot chain should remain
+   * consistent. (HDDS-8703)
+   */
+  @Test
+  public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover()
+      throws Exception {
+    OzoneManager oldLeader = cluster.getOMLeader();
+    String oldLeaderId = oldLeader.getOMNodeId();
+
+    int numSnapshots = 3;
+    List<String> snapshotNames = new ArrayList<>();
+    List<String> tableKeys = new ArrayList<>();
+
+    // Create multiple snapshots, each capturing distinct state.
+    for (int i = 0; i < numSnapshots; i++) {
+      createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10));
+      String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10);
+      createSnapshot(volumeName, bucketName, snapshotName);
+      snapshotNames.add(snapshotName);
+      tableKeys.add(SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName));
+    }
+
+    // Suspend SDS on the current leader so no cleanup starts yet.
+    oldLeader.getKeyManager().getSnapshotDeletingService().suspend();
+
+    // Queue all snapshots for deletion.
+    for (String snapshotName : snapshotNames) {
+      store.deleteSnapshot(volumeName, bucketName, snapshotName);
+    }
+
+    // Wait for every snapshot to be marked SNAPSHOT_DELETED on the old leader.
+    for (String tableKey : tableKeys) {
+      GenericTestUtils.waitFor(() -> {
+        try {
+          SnapshotInfo info = oldLeader.getMetadataManager()
+              .getSnapshotInfoTable().get(tableKey);
+          return info != null
+              && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED;
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }, 1000, 30000);
+    }
+
+    try {
+      // Trigger leader failover.
+      cluster.shutdownOzoneManager(oldLeader);
+      cluster.waitForLeaderOM();
+
+      OzoneManager newLeader = cluster.getOMLeader();
+      assertNotNull(newLeader);
+      assertNotEquals(oldLeaderId, newLeader.getOMNodeId());
+
+      // The new leader's SDS must purge all deleted snapshots.
+      for (String tableKey : tableKeys) {
+        checkSnapshotIsPurgedFromDB(newLeader, tableKey);
+      }
+
+      // Verify snapshot chain integrity after all cleanups.
+      OmMetadataManagerImpl metadataManager =
+          (OmMetadataManagerImpl) newLeader.getMetadataManager();
+      assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
+    } finally {
+      // Restore the 3-node cluster for subsequent tests.
+      cluster.restartOzoneManager(oldLeader, true);
+    }
+  }
+
   private void createFileKey(OzoneBucket bucket, String keyName)
       throws IOException {
     byte[] value = RandomStringUtils.secure().nextAscii(10240).getBytes(UTF_8);

From ec812ecdc478a837158fb2e16fb99dd4dc5333c1 Mon Sep 17 00:00:00 2001
From: Arun Sarin <arunsarin22@gmail.com>
Date: Thu, 2 Apr 2026 15:42:34 +0530
Subject: [PATCH 2/2] HDDS-8703. Address review comments: merge SDS failover
 tests into one parameterized test

---
 .../snapshot/TestOzoneManagerHASnapshot.java  | 98 ++++---------------
 1 file changed, 19 insertions(+), 79 deletions(-)

diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
index d2f0ec7c4d59..9ba27a94f116 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestOzoneManagerHASnapshot.java
@@ -62,6 +62,8 @@
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 
 /**
  * Tests snapshot in OM HA setup.
@@ -303,89 +305,23 @@ public void testSnapshotChainManagerRestore() throws Exception {
 
   /**
    * Tests that SnapshotDeletingService (SDS) correctly handles an OM leader
-   * failover. The old leader's SDS is suspended (simulating SDS being blocked
-   * or mid-cleanup) when a snapshot is queued for deletion. After the leader
-   * failover, the new leader's SDS picks up the pending work and correctly
-   * purges the snapshot. (HDDS-8703)
-   */
-  @Test
-  public void testSnapshotDeletingServiceDuringOMFailover()
-      throws Exception {
-    OzoneManager oldLeader = cluster.getOMLeader();
-    String oldLeaderId = oldLeader.getOMNodeId();
-
-    // Create keys and a snapshot so there is data to clean up.
-    int numKeys = 5;
-    for (int i = 0; i < numKeys; i++) {
-      createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10));
-    }
-
-    String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10);
-    createSnapshot(volumeName, bucketName, snapshotName);
-
-    // Suspend SDS on the current leader before the snapshot is deleted,
-    // simulating SDS being blocked while a cleanup is pending.
-    oldLeader.getKeyManager().getSnapshotDeletingService().suspend();
-
-    // Delete the snapshot — marks it as SNAPSHOT_DELETED in the DB.
-    store.deleteSnapshot(volumeName, bucketName, snapshotName);
-    String tableKey = SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName);
-
-    // Wait for the snapshot entry to reach SNAPSHOT_DELETED state on old leader.
-    GenericTestUtils.waitFor(() -> {
-      try {
-        SnapshotInfo info = oldLeader.getMetadataManager()
-            .getSnapshotInfoTable().get(tableKey);
-        return info != null
-            && info.getSnapshotStatus() == SnapshotInfo.SnapshotStatus.SNAPSHOT_DELETED;
-      } catch (IOException e) {
-        throw new RuntimeException(e);
-      }
-    }, 1000, 30000);
-
-    try {
-      // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining
-      // two nodes elect a new leader.
-      cluster.shutdownOzoneManager(oldLeader);
-      cluster.waitForLeaderOM();
-
-      OzoneManager newLeader = cluster.getOMLeader();
-      assertNotNull(newLeader);
-      // Confirm that a genuinely different OM node became leader.
-      assertNotEquals(oldLeaderId, newLeader.getOMNodeId());
-
-      // The new leader's SDS (not suspended) must process the pending deleted
-      // snapshot and purge it from the DB, even though the old leader's SDS
-      // never ran the cleanup.
-      checkSnapshotIsPurgedFromDB(newLeader, tableKey);
-
-      // Verify the snapshot chain is not corrupted after the cleanup.
-      OmMetadataManagerImpl metadataManager =
-          (OmMetadataManagerImpl) newLeader.getMetadataManager();
-      assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());
-    } finally {
-      // Restore the 3-node cluster for subsequent tests.
-      cluster.restartOzoneManager(oldLeader, true);
-    }
-  }
-
-  /**
-   * Tests that SDS on the new leader correctly handles multiple snapshots
-   * queued for deletion after an OM leader failover. After the failover, all
-   * pending deletions should be completed and the snapshot chain should remain
+   * failover with {@code numSnapshots} snapshots queued for deletion. The old
+   * leader's SDS is suspended before any snapshot is deleted, so it never
+   * starts the cleanup. After the failover, the new leader's SDS must pick up
+   * all pending deletions, purge them from the DB, and leave the snapshot chain
    * consistent. (HDDS-8703)
    */
-  @Test
-  public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover()
+  @ParameterizedTest
+  @ValueSource(ints = {1, 3})
+  public void testSnapshotDeletingServiceDuringOMFailover(int numSnapshots)
       throws Exception {
     OzoneManager oldLeader = cluster.getOMLeader();
     String oldLeaderId = oldLeader.getOMNodeId();
 
-    int numSnapshots = 3;
     List<String> snapshotNames = new ArrayList<>();
     List<String> tableKeys = new ArrayList<>();
 
-    // Create multiple snapshots, each capturing distinct state.
+    // Create numSnapshots snapshots, each capturing distinct state.
     for (int i = 0; i < numSnapshots; i++) {
       createFileKey(ozoneBucket, "key-" + RandomStringUtils.secure().nextNumeric(10));
       String snapshotName = "snap-" + RandomStringUtils.secure().nextNumeric(10);
@@ -394,7 +330,8 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover()
       tableKeys.add(SnapshotInfo.getTableKey(volumeName, bucketName, snapshotName));
     }
 
-    // Suspend SDS on the current leader so no cleanup starts yet.
+    // Suspend SDS on the current leader before any snapshot is deleted,
+    // simulating SDS being blocked while cleanup is pending.
     oldLeader.getKeyManager().getSnapshotDeletingService().suspend();
 
     // Queue all snapshots for deletion.
@@ -402,7 +339,7 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover()
       store.deleteSnapshot(volumeName, bucketName, snapshotName);
     }
 
-    // Wait for every snapshot to be marked SNAPSHOT_DELETED on the old leader.
+    // Wait for every snapshot to reach SNAPSHOT_DELETED state on the old leader.
     for (String tableKey : tableKeys) {
       GenericTestUtils.waitFor(() -> {
         try {
@@ -417,20 +354,23 @@ public void testSnapshotDeletingServiceWithMultipleSnapshotsDuringFailover()
     }
 
     try {
-      // Trigger leader failover.
+      // Trigger OM leader failover: with 3 OMs and quorum=2, the remaining
+      // two nodes elect a new leader.
       cluster.shutdownOzoneManager(oldLeader);
       cluster.waitForLeaderOM();
 
       OzoneManager newLeader = cluster.getOMLeader();
       assertNotNull(newLeader);
+      // Confirm that a genuinely different OM node became leader.
       assertNotEquals(oldLeaderId, newLeader.getOMNodeId());
 
-      // The new leader's SDS must purge all deleted snapshots.
+      // The new leader's SDS (not suspended) must purge all deleted snapshots,
+      // even though the old leader's SDS never ran the cleanup.
       for (String tableKey : tableKeys) {
         checkSnapshotIsPurgedFromDB(newLeader, tableKey);
       }
 
-      // Verify snapshot chain integrity after all cleanups.
+      // Verify the snapshot chain is not corrupted after all cleanups.
       OmMetadataManagerImpl metadataManager =
           (OmMetadataManagerImpl) newLeader.getMetadataManager();
       assertFalse(metadataManager.getSnapshotChainManager().isSnapshotChainCorrupted());