From 5fac1ab2cfdf04a1dee030b64de3984a1ff5d8cb Mon Sep 17 00:00:00 2001
From: Yang Guo <jimmy_guoyang@outlook.com>
Date: Sun, 24 May 2026 18:48:05 +0800
Subject: [PATCH 1/4] fix: stop-replica-failed

---
 .../coordinator/CoordinatorContext.java       |  68 +++
 .../CoordinatorEventProcessor.java            | 138 ++++-
 .../coordinator/CoordinatorRequestBatch.java  |  38 +-
 .../server/coordinator/TableManager.java      | 139 ++++-
 .../event/StopReplicaSendFailedEvent.java     |  41 ++
 .../statemachine/ReplicaState.java            |  10 +-
 .../statemachine/ReplicaStateMachine.java     |   4 +
 .../coordinator/CoordinatorContextTest.java   |  77 +++
 .../CoordinatorEventProcessorTest.java        | 541 ++++++++++++++++++
 .../CoordinatorRequestBatchTest.java          | 131 +++++
 .../server/coordinator/StopReplicaITCase.java | 326 +++++++++++
 .../server/coordinator/TableManagerTest.java  | 354 ++++++++++++
 .../statemachine/ReplicaStateMachineTest.java |  47 ++
 13 files changed, 1888 insertions(+), 26 deletions(-)
 create mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
 create mode 100644 fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
index 92dfc5ee5c..59de15087b 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
@@ -98,6 +98,15 @@ public class CoordinatorContext {
 
     private final Set<TablePartition> partitionsToBeDeleted = new HashSet<>();
 
+    /**
+     * Tables/partitions temporarily ineligible for deletion retry. Marked when a replica's hosting
+     * TabletServer is offline or stopReplica RPC failed; cleared when the TabletServer reconnects.
+     * The value stores the reason for diagnostic logging.
+     */
+    private final Map<Long, String> tablesIneligibleForDeletion = new HashMap<>();
+
+    private final Map<TablePartition, String> partitionsIneligibleForDeletion = new HashMap<>();
+
     /**
      * A mapping from tablet server to offline buckets. When the leader replica of a table bucket
      * become offline, we'll put the entry tablet_server_id -> table_bucket to this map so that we
@@ -652,8 +661,63 @@ public void queuePartitionDeletion(Set<TablePartition> tablePartitions) {
         partitionsToBeDeleted.addAll(tablePartitions);
     }
 
+    /** Mark a table as ineligible for deletion (only effective if queued for deletion). */
+    public void markTableIneligibleForDeletion(long tableId, String reason) {
+        if (tablesToBeDeleted.contains(tableId)) {
+            String prev = tablesIneligibleForDeletion.put(tableId, reason);
+            if (prev == null) {
+                LOG.info("Marking table {} ineligible for deletion. Reason: {}", tableId, reason);
+            }
+        }
+    }
+
+    public void markPartitionIneligibleForDeletion(TablePartition tablePartition, String reason) {
+        if (partitionsToBeDeleted.contains(tablePartition)) {
+            String prev = partitionsIneligibleForDeletion.put(tablePartition, reason);
+            if (prev == null) {
+                LOG.info(
+                        "Marking partition {} ineligible for deletion. Reason: {}",
+                        tablePartition,
+                        reason);
+            }
+        }
+    }
+
+    /** Remove the ineligible mark, allowing deletion to be retried. */
+    public void markTableEligibleForDeletion(long tableId) {
+        tablesIneligibleForDeletion.remove(tableId);
+    }
+
+    public void markPartitionEligibleForDeletion(TablePartition tablePartition) {
+        partitionsIneligibleForDeletion.remove(tablePartition);
+    }
+
+    public boolean isTableIneligibleForDeletion(long tableId) {
+        return tablesIneligibleForDeletion.containsKey(tableId);
+    }
+
+    public boolean isPartitionIneligibleForDeletion(TablePartition tablePartition) {
+        return partitionsIneligibleForDeletion.containsKey(tablePartition);
+    }
+
+    public Set<TableBucketReplica> getReplicasInState(long tableId, ReplicaState state) {
+        return getAllReplicasForTable(tableId).stream()
+                .filter(r -> getReplicaState(r) == state)
+                .collect(Collectors.toSet());
+    }
+
+    public Set<TableBucketReplica> getReplicasInState(
+            TablePartition tablePartition, ReplicaState state) {
+        return getAllReplicasForPartition(
+                        tablePartition.getTableId(), tablePartition.getPartitionId())
+                .stream()
+                .filter(r -> getReplicaState(r) == state)
+                .collect(Collectors.toSet());
+    }
+
     public void removeTable(long tableId) {
         tablesToBeDeleted.remove(tableId);
+        tablesIneligibleForDeletion.remove(tableId);
         Map<Integer, List<Integer>> assignment = tableAssignments.remove(tableId);
         if (assignment != null) {
             // remove leadership info for each bucket from the context
@@ -671,6 +735,7 @@ public void removeTable(long tableId) {
 
     public void removePartition(TablePartition tablePartition) {
         partitionsToBeDeleted.remove(tablePartition);
+        partitionsIneligibleForDeletion.remove(tablePartition);
         Map<Integer, List<Integer>> assignment = partitionAssignments.remove(tablePartition);
         if (assignment != null) {
             // remove leadership info for each bucket from the context
@@ -728,6 +793,9 @@ private void clearTablesState() {
 
     public void resetContext() {
         tablesToBeDeleted.clear();
+        partitionsToBeDeleted.clear();
+        tablesIneligibleForDeletion.clear();
+        partitionsIneligibleForDeletion.clear();
         clearTablesState();
         liveTabletServers.clear();
         liveCoordinatorServers.clear();
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
index 96f39ef08a..ce4befe146 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
@@ -87,6 +87,7 @@
 import org.apache.fluss.server.coordinator.event.RebalanceEvent;
 import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent;
 import org.apache.fluss.server.coordinator.event.SchemaChangeEvent;
+import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.coordinator.event.TableRegistrationChangeEvent;
 import org.apache.fluss.server.coordinator.event.watcher.CoordinatorChangeWatcher;
 import org.apache.fluss.server.coordinator.event.watcher.TableChangeWatcher;
@@ -95,6 +96,7 @@
 import org.apache.fluss.server.coordinator.rebalance.RebalanceManager;
 import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ControlledShutdownLeaderElection;
 import org.apache.fluss.server.coordinator.statemachine.ReplicaLeaderElection.ReassignmentLeaderElection;
+import org.apache.fluss.server.coordinator.statemachine.ReplicaState;
 import org.apache.fluss.server.coordinator.statemachine.ReplicaStateMachine;
 import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine;
 import org.apache.fluss.server.entity.AdjustIsrResultForBucket;
@@ -150,6 +152,7 @@
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.NonExistentReplica;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OfflineReplica;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OnlineReplica;
+import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionIneligible;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionStarted;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionSuccessful;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaMigrationStarted;
@@ -453,6 +456,10 @@ private void initCoordinatorContext() throws Exception {
                 "Load table and partition assignment success in {}ms when initializing coordinator context.",
                 System.currentTimeMillis() - start4loadAssignment);
 
+        // Rebuild ineligible marks lost during failover by scanning queued-for-deletion
+        // tables/partitions for replicas on offline tablet servers.
+        initIneligibleForDeletion();
+
         long end = System.currentTimeMillis();
         LOG.info("Current total {} tables in the cluster.", coordinatorContext.allTables().size());
         LOG.info(
@@ -464,6 +471,36 @@ private void initCoordinatorContext() throws Exception {
         LOG.info("End initializing coordinator context, cost {}ms", end - start);
     }
 
+    private void initIneligibleForDeletion() {
+        Set<Integer> liveServers = coordinatorContext.liveTabletServerSet();
+        for (long tableId : coordinatorContext.getTablesToBeDeleted()) {
+            for (TableBucketReplica replica : coordinatorContext.getAllReplicasForTable(tableId)) {
+                if (!liveServers.contains(replica.getReplica())) {
+                    coordinatorContext.markTableIneligibleForDeletion(
+                            tableId,
+                            "tabletServer "
+                                    + replica.getReplica()
+                                    + " hosting replica is offline at coordinator startup");
+                    break;
+                }
+            }
+        }
+        for (TablePartition partition : coordinatorContext.getPartitionsToBeDeleted()) {
+            for (TableBucketReplica replica :
+                    coordinatorContext.getAllReplicasForPartition(
+                            partition.getTableId(), partition.getPartitionId())) {
+                if (!liveServers.contains(replica.getReplica())) {
+                    coordinatorContext.markPartitionIneligibleForDeletion(
+                            partition,
+                            "tabletServer "
+                                    + replica.getReplica()
+                                    + " hosting replica is offline at coordinator startup");
+                    break;
+                }
+            }
+        }
+    }
+
     private void loadTableAssignment() throws Exception {
         List<String> assignmentTables = zooKeeperClient.getChildren(TableIdsZNode.path());
         Set<Long> deletedTables = new HashSet<>();
@@ -584,6 +621,8 @@ public void process(CoordinatorEvent event) {
                     (NotifyLeaderAndIsrResponseReceivedEvent) event);
         } else if (event instanceof DeleteReplicaResponseReceivedEvent) {
             processDeleteReplicaResponseReceived((DeleteReplicaResponseReceivedEvent) event);
+        } else if (event instanceof StopReplicaSendFailedEvent) {
+            processStopReplicaSendFailed((StopReplicaSendFailedEvent) event);
         } else if (event instanceof NewCoordinatorEvent) {
             processNewCoordinator((NewCoordinatorEvent) event);
         } else if (event instanceof DeadCoordinatorEvent) {
@@ -869,7 +908,8 @@ private void processDropTable(DropTableEvent dropTableEvent) {
         }
 
         coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
-        tableManager.onDeleteTable(tableId);
+        // Route through resumeDeletions so that the eligibility check applies uniformly.
+        tableManager.resumeDeletions();
         if (dropTableEvent.isAutoPartitionTable()) {
             autoPartitionManager.removeAutoPartitionTable(tableId);
         }
@@ -904,7 +944,8 @@ private void processDropPartition(DropPartitionEvent dropPartitionEvent) {
         }
 
         coordinatorContext.queuePartitionDeletion(Collections.singleton(tablePartition));
-        tableManager.onDeletePartition(tableId, dropPartitionEvent.getPartitionId());
+        // Route through resumeDeletions so that the eligibility check applies uniformly.
+        tableManager.resumeDeletions();
         autoPartitionManager.removePartition(tableId, dropPartitionEvent.getPartitionName());
 
         // send update metadata request.
@@ -939,7 +980,9 @@ private void processDeleteReplicaResponseReceived(
         // clear the fail deleted number for the success deleted replicas
         coordinatorContext.clearFailDeleteNumbers(successDeletedReplicas);
 
-        // pick up the replicas to retry delete and replicas that considered as success delete
+        // Response-level failures (TS replied with per-bucket error): retry up to
+        // DELETE_TRY_TIMES then force-succeed. RPC-layer failures (no response) are
+        // handled separately via StopReplicaSendFailedEvent.
         Tuple2<Set<TableBucketReplica>, Set<TableBucketReplica>>
                 retryDeleteAndSuccessDeleteReplicas =
                         coordinatorContext.retryDeleteAndSuccessDeleteReplicas(failDeletedReplicas);
@@ -959,6 +1002,38 @@ private void processDeleteReplicaResponseReceived(
         }
     }
 
+    private void processStopReplicaSendFailed(StopReplicaSendFailedEvent event) {
+        // Transition replicas to ReplicaDeletionIneligible and mark the table/partition
+        // ineligible until the TabletServer reconnects. Only process replicas still in
+        // ReplicaDeletionStarted (parallel events may have already moved them).
+        Set<TableBucketReplica> stillInDeletion =
+                event.getFailedReplicas().stream()
+                        .filter(
+                                replica ->
+                                        coordinatorContext.getReplicaState(replica)
+                                                == ReplicaDeletionStarted)
+                        .collect(Collectors.toSet());
+        if (stillInDeletion.isEmpty()) {
+            return;
+        }
+        replicaStateMachine.handleStateChanges(stillInDeletion, ReplicaDeletionIneligible);
+        markFailedReplicasIneligible(stillInDeletion, "stopReplica RPC send failed");
+        tableManager.resumeDeletions();
+    }
+
+    private void markFailedReplicasIneligible(
+            Set<TableBucketReplica> failedReplicas, String reason) {
+        for (TableBucketReplica replica : failedReplicas) {
+            TableBucket tb = replica.getTableBucket();
+            if (tb.getPartitionId() != null) {
+                coordinatorContext.markPartitionIneligibleForDeletion(
+                        new TablePartition(tb.getTableId(), tb.getPartitionId()), reason);
+            } else {
+                coordinatorContext.markTableIneligibleForDeletion(tb.getTableId(), reason);
+            }
+        }
+    }
+
     private void processNotifyLeaderAndIsrResponseReceivedEvent(
             NotifyLeaderAndIsrResponseReceivedEvent notifyLeaderAndIsrResponseReceivedEvent) {
         // get the server that receives the response
@@ -1116,6 +1191,31 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
         // and offline partitions to see if those tablet servers become leaders for some/all
         // of those
         tableBucketStateMachine.triggerOnlineBucketStateChange();
+
+        // Clear ineligible marks for tables/partitions with replicas on the reconnecting
+        // server and resume their deletion.
+        Set<TableBucketReplica> replicasOnReconnectingServer =
+                coordinatorContext.replicasOnTabletServer(tabletServerId);
+        Set<Long> tablesToResume =
+                replicasOnReconnectingServer.stream()
+                        .map(r -> r.getTableBucket().getTableId())
+                        .filter(coordinatorContext::isTableQueuedForDeletion)
+                        .collect(Collectors.toSet());
+        Set<TablePartition> partitionsToResume =
+                replicasOnReconnectingServer.stream()
+                        .filter(r -> r.getTableBucket().getPartitionId() != null)
+                        .map(
+                                r ->
+                                        new TablePartition(
+                                                r.getTableBucket().getTableId(),
+                                                r.getTableBucket().getPartitionId()))
+                        .filter(coordinatorContext::isPartitionQueuedForDeletion)
+                        .collect(Collectors.toSet());
+        if (!tablesToResume.isEmpty() || !partitionsToResume.isEmpty()) {
+            tablesToResume.forEach(coordinatorContext::markTableEligibleForDeletion);
+            partitionsToResume.forEach(coordinatorContext::markPartitionEligibleForDeletion);
+            tableManager.resumeDeletions();
+        }
     }
 
     private void processDeadTabletServer(DeadTabletServerEvent deadTabletServerEvent) {
@@ -1175,6 +1275,38 @@ private void processDeadTabletServer(DeadTabletServerEvent deadTabletServerEvent
         // trigger OfflineReplica state change for those newly offline replicas
         replicaStateMachine.handleStateChanges(replicas, OfflineReplica);
 
+        // For to-be-deleted replicas on the dead server, transition them to
+        // ReplicaDeletionIneligible and mark the owning table/partition ineligible
+        // until the TabletServer reconnects.
+        Set<TableBucketReplica> deletedReplicasInDeletion = new HashSet<>();
+        Set<TableBucketReplica> deletedReplicasNotInDeletion = new HashSet<>();
+        for (TableBucketReplica replica :
+                coordinatorContext.replicasOnTabletServer(tabletServerId)) {
+            if (!coordinatorContext.isToBeDeleted(replica.getTableBucket())) {
+                continue;
+            }
+            ReplicaState state = coordinatorContext.getReplicaState(replica);
+            if (state == ReplicaDeletionStarted) {
+                deletedReplicasInDeletion.add(replica);
+            } else if (state == OnlineReplica || state == NewReplica || state == OfflineReplica) {
+                deletedReplicasNotInDeletion.add(replica);
+            }
+        }
+        if (!deletedReplicasNotInDeletion.isEmpty()) {
+            replicaStateMachine.handleStateChanges(deletedReplicasNotInDeletion, OfflineReplica);
+            replicaStateMachine.handleStateChanges(
+                    deletedReplicasNotInDeletion, ReplicaDeletionIneligible);
+            markFailedReplicasIneligible(deletedReplicasNotInDeletion, "tabletServer offline");
+        }
+        if (!deletedReplicasInDeletion.isEmpty()) {
+            replicaStateMachine.handleStateChanges(
+                    deletedReplicasInDeletion, ReplicaDeletionIneligible);
+            markFailedReplicasIneligible(deletedReplicasInDeletion, "tabletServer offline");
+        }
+        if (!deletedReplicasInDeletion.isEmpty() || !deletedReplicasNotInDeletion.isEmpty()) {
+            tableManager.resumeDeletions();
+        }
+
         // update tabletServer metadata cache by send updateMetadata request.
         updateTabletServerMetadataCache(serverInfos, null, null, bucketsWithOfflineLeader);
     }
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
index 3441053671..126e6098b2 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
@@ -38,6 +38,7 @@
 import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
 import org.apache.fluss.server.coordinator.event.EventManager;
 import org.apache.fluss.server.coordinator.event.NotifyLeaderAndIsrResponseReceivedEvent;
+import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.entity.DeleteReplicaResultForBucket;
 import org.apache.fluss.server.entity.NotifyLeaderAndIsrData;
 import org.apache.fluss.server.metadata.BucketMetadata;
@@ -440,6 +441,7 @@ private void sendNotifyLeaderAndIsrRequest(int coordinatorEpoch) {
     }
 
     private void sendStopRequest(int coordinatorEpoch) {
+        Set<Integer> liveServers = coordinatorContext.liveTabletServerSet();
         for (Map.Entry<Integer, Map<TableBucket, PbStopReplicaReqForBucket>> stopReplciaEntry :
                 stopReplicaRequestMap.entrySet()) {
             // send request for each tablet server
@@ -459,18 +461,48 @@ private void sendStopRequest(int coordinatorEpoch) {
                             .map(t -> toTableBucket(t.getTableBucket()))
                             .collect(Collectors.toSet());
 
+            // If the TS is not live, the channel manager has no gateway and would silently
+            // drop the request. Surface the failure so deletion replicas can be marked
+            // ineligible instead of stuck in ReplicaDeletionStarted.
+            if (!liveServers.contains(serverId)) {
+                if (!deletedReplicaBuckets.isEmpty()) {
+                    LOG.warn(
+                            "Tablet server {} is not in the live set when sending stop replica "
+                                    + "request; surfacing as StopReplicaSendFailedEvent for {} "
+                                    + "deletion replica(s).",
+                            serverId,
+                            deletedReplicaBuckets.size());
+                    Set<TableBucketReplica> failedReplicas =
+                            deletedReplicaBuckets.stream()
+                                    .map(bucket -> new TableBucketReplica(bucket, serverId))
+                                    .collect(Collectors.toSet());
+                    eventManager.put(new StopReplicaSendFailedEvent(failedReplicas));
+                }
+                continue;
+            }
+
             coordinatorChannelManager.sendStopBucketReplicaRequest(
                     serverId,
                     stopReplicaRequest,
                     (response, throwable) -> {
                         if (throwable != null) {
-                            // todo: in FLUSS-55886145, we will introduce a sender thread to send
-                            // the request.
-                            // in here, we just ignore the error.
                             LOG.warn(
                                     "Failed to send stop replica request to tablet server {}.",
                                     serverId,
                                     throwable);
+                            // For deletion replicas, surface as StopReplicaSendFailedEvent so
+                            // they can be marked ineligible. Non-delete stop replica failures
+                            // are best-effort and can be ignored.
+                            if (!deletedReplicaBuckets.isEmpty()) {
+                                Set<TableBucketReplica> failedReplicas =
+                                        deletedReplicaBuckets.stream()
+                                                .map(
+                                                        bucket ->
+                                                                new TableBucketReplica(
+                                                                        bucket, serverId))
+                                                .collect(Collectors.toSet());
+                                eventManager.put(new StopReplicaSendFailedEvent(failedReplicas));
+                            }
                             return;
                         }
                         // handle the response
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
index b92a2dad6e..ab2c94dcbb 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
@@ -39,6 +39,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
+import java.util.stream.Collectors;
 
 /** A manager for tables. */
 public class TableManager {
@@ -186,19 +187,57 @@ public void onDeletePartition(long tableId, long partitionId) {
      *
      * <p>It does the following:
      *
-     * <p>1. Move all the replicas to offline state. This will send stop replica request to the
-     * replicas.
+     * <p>1. Skip replicas already in {@code ReplicaDeletionSuccessful}.
      *
-     * <p>2. Move all the replicas to deletion started state. This will send stop replica request
-     * with delete=true which will delete all persistent data from all the replicas of the all the
-     * respective buckets.
+     * <p>2. For dead replicas (owning TS not live): transition to {@code ReplicaDeletionIneligible}
+     * and mark the table/partition ineligible until the TS reconnects.
+     *
+     * <p>3. For alive replicas: transition through {@code OfflineReplica -> ReplicaDeletionStarted}
+     * which sends {@code stopReplica(delete=true)}.
      */
     private void onDeleteTableBucket(Set<TableBucketReplica> allReplicas) {
-        // to offline, send stop replica to all followers that are not in the OfflineReplica state
-        // so they stop sending fetch requests to the leader
-        replicaStateMachine.handleStateChanges(allReplicas, ReplicaState.OfflineReplica);
-        // to deletion started
-        replicaStateMachine.handleStateChanges(allReplicas, ReplicaState.ReplicaDeletionStarted);
+        Set<TableBucketReplica> pending =
+                allReplicas.stream()
+                        .filter(
+                                r ->
+                                        coordinatorContext.getReplicaState(r)
+                                                != ReplicaState.ReplicaDeletionSuccessful)
+                        .collect(Collectors.toSet());
+
+        Set<Integer> liveServers = coordinatorContext.liveTabletServerSet();
+        Set<TableBucketReplica> aliveReplicas = new HashSet<>();
+        Set<TableBucketReplica> deadReplicas = new HashSet<>();
+        for (TableBucketReplica replica : pending) {
+            if (liveServers.contains(replica.getReplica())) {
+                aliveReplicas.add(replica);
+            } else {
+                deadReplicas.add(replica);
+            }
+        }
+
+        if (!deadReplicas.isEmpty()) {
+            replicaStateMachine.handleStateChanges(deadReplicas, ReplicaState.OfflineReplica);
+            replicaStateMachine.handleStateChanges(
+                    deadReplicas, ReplicaState.ReplicaDeletionIneligible);
+            for (TableBucketReplica replica : deadReplicas) {
+                TableBucket tb = replica.getTableBucket();
+                if (tb.getPartitionId() != null) {
+                    coordinatorContext.markPartitionIneligibleForDeletion(
+                            new TablePartition(tb.getTableId(), tb.getPartitionId()),
+                            "tabletServer was not in live set at deletion kickoff");
+                } else {
+                    coordinatorContext.markTableIneligibleForDeletion(
+                            tb.getTableId(),
+                            "tabletServer was not in live set at deletion kickoff");
+                }
+            }
+        }
+
+        if (!aliveReplicas.isEmpty()) {
+            replicaStateMachine.handleStateChanges(aliveReplicas, ReplicaState.OfflineReplica);
+            replicaStateMachine.handleStateChanges(
+                    aliveReplicas, ReplicaState.ReplicaDeletionStarted);
+        }
     }
 
     public void resumeDeletions() {
@@ -208,19 +247,38 @@ public void resumeDeletions() {
 
     private void resumeTableDeletions() {
         Set<Long> tablesToBeDeleted = new HashSet<>(coordinatorContext.getTablesToBeDeleted());
+        Set<Long> tablesEligibleForRetry = new HashSet<>();
         Set<Long> eligibleTableDeletion = new HashSet<>();
 
         for (long tableId : tablesToBeDeleted) {
-            // if all replicas are marked as deleted successfully, then table deletion is done
+            // Step 1: if all replicas are marked as deleted successfully, complete deletion.
             if (coordinatorContext.areAllReplicasInState(
                     tableId, ReplicaState.ReplicaDeletionSuccessful)) {
                 completeDeleteTable(tableId);
                 LOG.info("Deletion of table with id {} successfully completed.", tableId);
+                continue;
+            }
+
+            // Step 2: if no replica is in ReplicaDeletionStarted AND any is Ineligible,
+            // transition Ineligible replicas back to OfflineReplica for retry.
+            if (!coordinatorContext.isAnyReplicaInState(
+                            tableId, ReplicaState.ReplicaDeletionStarted)
+                    && coordinatorContext.isAnyReplicaInState(
+                            tableId, ReplicaState.ReplicaDeletionIneligible)) {
+                tablesEligibleForRetry.add(tableId);
             }
+
+            // Step 3: only tables that pass the ineligible-flag guard get onDeleteTable.
             if (isEligibleForDeletion(tableId)) {
                 eligibleTableDeletion.add(tableId);
             }
         }
+
+        // Retry transitions run first so that eligible replicas can be re-started below.
+        if (!tablesEligibleForRetry.isEmpty()) {
+            retryDeletionForIneligibleReplicas(tablesEligibleForRetry);
+        }
+
         if (!eligibleTableDeletion.isEmpty()) {
             for (long tableId : eligibleTableDeletion) {
                 onDeleteTable(tableId);
@@ -231,19 +289,33 @@ private void resumeTableDeletions() {
     private void resumePartitionDeletions() {
         Set<TablePartition> partitionsToDelete =
                 new HashSet<>(coordinatorContext.getPartitionsToBeDeleted());
+        Set<TablePartition> partitionsEligibleForRetry = new HashSet<>();
         Set<TablePartition> eligiblePartitionDeletion = new HashSet<>();
 
         for (TablePartition partition : partitionsToDelete) {
-            // if all replicas are marked as deleted successfully, then partition deletion is done
             if (coordinatorContext.areAllReplicasInState(
                     partition, ReplicaState.ReplicaDeletionSuccessful)) {
                 completeDeletePartition(partition);
                 LOG.info("Deletion of partition {} successfully completed.", partition);
+                continue;
             }
+
+            if (!coordinatorContext.isAnyReplicaInState(
+                            partition, ReplicaState.ReplicaDeletionStarted)
+                    && coordinatorContext.isAnyReplicaInState(
+                            partition, ReplicaState.ReplicaDeletionIneligible)) {
+                partitionsEligibleForRetry.add(partition);
+            }
+
             if (isEligibleForDeletion(partition)) {
                 eligiblePartitionDeletion.add(partition);
             }
         }
+
+        if (!partitionsEligibleForRetry.isEmpty()) {
+            retryDeletionForIneligibleReplicasInPartitions(partitionsEligibleForRetry);
+        }
+
         if (!eligiblePartitionDeletion.isEmpty()) {
             for (TablePartition partition : eligiblePartitionDeletion) {
                 onDeletePartition(partition.getTableId(), partition.getPartitionId());
@@ -251,6 +323,38 @@ private void resumePartitionDeletions() {
         }
     }
 
+    /** Transitions all Ineligible replicas for the given tables to OfflineReplica. */
+    private void retryDeletionForIneligibleReplicas(Set<Long> tableIds) {
+        Set<TableBucketReplica> replicas =
+                tableIds.stream()
+                        .flatMap(
+                                id ->
+                                        coordinatorContext
+                                                .getReplicasInState(
+                                                        id, ReplicaState.ReplicaDeletionIneligible)
+                                                .stream())
+                        .collect(Collectors.toSet());
+        if (!replicas.isEmpty()) {
+            replicaStateMachine.handleStateChanges(replicas, ReplicaState.OfflineReplica);
+        }
+    }
+
+    /** Transitions all Ineligible replicas for the given partitions to OfflineReplica. */
+    private void retryDeletionForIneligibleReplicasInPartitions(Set<TablePartition> partitions) {
+        Set<TableBucketReplica> replicas =
+                partitions.stream()
+                        .flatMap(
+                                p ->
+                                        coordinatorContext
+                                                .getReplicasInState(
+                                                        p, ReplicaState.ReplicaDeletionIneligible)
+                                                .stream())
+                        .collect(Collectors.toSet());
+        if (!replicas.isEmpty()) {
+            replicaStateMachine.handleStateChanges(replicas, ReplicaState.OfflineReplica);
+        }
+    }
+
     private void completeDeleteTable(long tableId) {
         Set<TableBucketReplica> replicas = coordinatorContext.getAllReplicasForTable(tableId);
         replicaStateMachine.handleStateChanges(replicas, ReplicaState.NonExistentReplica);
@@ -321,18 +425,17 @@ private void asyncDeletePartitionMetadata(long partitionId) {
     }
 
     private boolean isEligibleForDeletion(long tableId) {
-        // the table is queued for deletion and
-        // no any replica is in state deletion started
+        // Three-guard check: queued, no replica in Started, not in ineligible set.
         return coordinatorContext.isTableQueuedForDeletion(tableId)
                 && !coordinatorContext.isAnyReplicaInState(
-                        tableId, ReplicaState.ReplicaDeletionStarted);
+                        tableId, ReplicaState.ReplicaDeletionStarted)
+                && !coordinatorContext.isTableIneligibleForDeletion(tableId);
     }
 
     private boolean isEligibleForDeletion(TablePartition tablePartition) {
-        // the partition is queued for deletion and
-        // no any replica is in state deletion started
         return coordinatorContext.isPartitionQueuedForDeletion(tablePartition)
                 && !coordinatorContext.isAnyReplicaInState(
-                        tablePartition, ReplicaState.ReplicaDeletionStarted);
+                        tablePartition, ReplicaState.ReplicaDeletionStarted)
+                && !coordinatorContext.isPartitionIneligibleForDeletion(tablePartition);
     }
 }
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
new file mode 100644
index 0000000000..59e160418b
--- /dev/null
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.event;
+
+import org.apache.fluss.metadata.TableBucketReplica;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Signals that a {@code stopReplica(delete=true)} request could not be delivered at the RPC layer
+ * (e.g., gateway missing or network error). The event processor transitions affected replicas to
+ * {@code ReplicaDeletionIneligible} until the tablet server reconnects.
+ */
+public class StopReplicaSendFailedEvent implements CoordinatorEvent {
+
+    private final Set<TableBucketReplica> failedReplicas;
+
+    public StopReplicaSendFailedEvent(Set<TableBucketReplica> failedReplicas) {
+        this.failedReplicas = Collections.unmodifiableSet(failedReplicas);
+    }
+
+    public Set<TableBucketReplica> getFailedReplicas() {
+        return failedReplicas;
+    }
+}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaState.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaState.java
index 5ed5732cf1..5c12ef7088 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaState.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaState.java
@@ -39,13 +39,13 @@ public Set<ReplicaState> getValidPreviousStates() {
     OnlineReplica {
         @Override
         public Set<ReplicaState> getValidPreviousStates() {
-            return EnumSet.of(NewReplica, OnlineReplica, OfflineReplica);
+            return EnumSet.of(NewReplica, OnlineReplica, OfflineReplica, ReplicaDeletionIneligible);
         }
     },
     OfflineReplica {
         @Override
         public Set<ReplicaState> getValidPreviousStates() {
-            return EnumSet.of(NewReplica, OnlineReplica, OfflineReplica);
+            return EnumSet.of(NewReplica, OnlineReplica, OfflineReplica, ReplicaDeletionIneligible);
         }
     },
     ReplicaMigrationStarted {
@@ -69,5 +69,11 @@ public Set<ReplicaState> getValidPreviousStates() {
         public Set<ReplicaState> getValidPreviousStates() {
             return EnumSet.of(ReplicaDeletionStarted);
         }
+    },
+    ReplicaDeletionIneligible {
+        @Override
+        public Set<ReplicaState> getValidPreviousStates() {
+            return EnumSet.of(OfflineReplica, ReplicaDeletionStarted);
+        }
     }
 }
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java
index d35bb7578b..c572807cf3 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachine.java
@@ -346,6 +346,10 @@ private void doHandleStateChanges(
                 validReplicas.forEach(
                         replica -> doStateChange(replica, ReplicaState.ReplicaDeletionSuccessful));
                 break;
+            case ReplicaDeletionIneligible:
+                validReplicas.forEach(
+                        replica -> doStateChange(replica, ReplicaState.ReplicaDeletionIneligible));
+                break;
             case NonExistentReplica:
                 validReplicas.forEach(replica -> doStateChange(replica, null));
                 break;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
index 2b788bdb48..26c03fe043 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
@@ -21,6 +21,7 @@
 import org.apache.fluss.metadata.Schema;
 import org.apache.fluss.metadata.TableDescriptor;
 import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePartition;
 import org.apache.fluss.metadata.TablePath;
 import org.apache.fluss.server.zk.ZkEpoch;
 import org.apache.fluss.types.DataTypes;
@@ -28,6 +29,7 @@
 import org.junit.jupiter.api.Test;
 
 import java.time.Duration;
+import java.util.Collections;
 
 import static org.apache.fluss.config.ConfigOptions.TABLE_DATALAKE_ENABLED;
 import static org.apache.fluss.record.TestData.DEFAULT_REMOTE_DATA_DIR;
@@ -69,6 +71,81 @@ void testGetLakeTableCount() {
         assertThat(context.getLakeTableCount()).isEqualTo(2);
     }
 
+    @Test
+    void testIneligibleForDeletionMarkAndRemove() {
+        CoordinatorContext context = new CoordinatorContext(ZkEpoch.INITIAL_EPOCH);
+
+        long queuedTableId = 100L;
+        long unqueuedTableId = 200L;
+        TablePartition queuedPartition = new TablePartition(300L, 1L);
+        TablePartition unqueuedPartition = new TablePartition(400L, 2L);
+
+        // only tables/partitions that are queued for deletion can be marked ineligible
+        context.queueTableDeletion(Collections.singleton(queuedTableId));
+        context.queuePartitionDeletion(Collections.singleton(queuedPartition));
+
+        // initially nothing is ineligible
+        assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isFalse();
+        assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isFalse();
+
+        // (a) mark-when-queued sets the ineligible flag
+        context.markTableIneligibleForDeletion(queuedTableId, "first reason");
+        context.markPartitionIneligibleForDeletion(queuedPartition, "first reason");
+        assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isTrue();
+        assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isTrue();
+
+        // (b) re-mark same id with new reason keeps the flag set (reason is overwritten)
+        context.markTableIneligibleForDeletion(queuedTableId, "updated reason");
+        context.markPartitionIneligibleForDeletion(queuedPartition, "updated reason");
+        assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isTrue();
+        assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isTrue();
+
+        // (c) mark-when-NOT-queued is a no-op
+        context.markTableIneligibleForDeletion(unqueuedTableId, "should be ignored");
+        context.markPartitionIneligibleForDeletion(unqueuedPartition, "should be ignored");
+        assertThat(context.isTableIneligibleForDeletion(unqueuedTableId)).isFalse();
+        assertThat(context.isPartitionIneligibleForDeletion(unqueuedPartition)).isFalse();
+
+        // (d) markEligibleForDeletion removes the ineligible flag
+        context.markTableEligibleForDeletion(queuedTableId);
+        context.markPartitionEligibleForDeletion(queuedPartition);
+        assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isFalse();
+        assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isFalse();
+
+        // re-marking after eligible should set it back to ineligible
+        context.markTableIneligibleForDeletion(queuedTableId, "second-time reason");
+        context.markPartitionIneligibleForDeletion(queuedPartition, "second-time reason");
+        assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isTrue();
+        assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isTrue();
+    }
+
+    @Test
+    void testResetContextClearsAllDeletionState() {
+        CoordinatorContext context = new CoordinatorContext(ZkEpoch.INITIAL_EPOCH);
+
+        long tableId = 100L;
+        TablePartition partition = new TablePartition(200L, 1L);
+
+        // pre-seed all four collections covered by the fix
+        context.queueTableDeletion(Collections.singleton(tableId));
+        context.queuePartitionDeletion(Collections.singleton(partition));
+        context.markTableIneligibleForDeletion(tableId, "blocked");
+        context.markPartitionIneligibleForDeletion(partition, "blocked");
+
+        // sanity-check pre-conditions before reset
+        assertThat(context.getTablesToBeDeleted()).contains(tableId);
+        assertThat(context.getPartitionsToBeDeleted()).contains(partition);
+        assertThat(context.isTableIneligibleForDeletion(tableId)).isTrue();
+        assertThat(context.isPartitionIneligibleForDeletion(partition)).isTrue();
+
+        context.resetContext();
+
+        assertThat(context.getTablesToBeDeleted()).isEmpty();
+        assertThat(context.getPartitionsToBeDeleted()).isEmpty();
+        assertThat(context.isTableIneligibleForDeletion(tableId)).isFalse();
+        assertThat(context.isPartitionIneligibleForDeletion(partition)).isFalse();
+    }
+
     private TableInfo createTableInfo(long tableId, TablePath tablePath, boolean isLake) {
         TableDescriptor tableDescriptor =
                 TableDescriptor.builder()
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
index e1226156e1..bf579bce75 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
@@ -51,6 +51,11 @@
 import org.apache.fluss.server.coordinator.event.CommitKvSnapshotEvent;
 import org.apache.fluss.server.coordinator.event.CommitRemoteLogManifestEvent;
 import org.apache.fluss.server.coordinator.event.CoordinatorEventManager;
+import org.apache.fluss.server.coordinator.event.DeadTabletServerEvent;
+import org.apache.fluss.server.coordinator.event.DropPartitionEvent;
+import org.apache.fluss.server.coordinator.event.DropTableEvent;
+import org.apache.fluss.server.coordinator.event.NewTabletServerEvent;
+import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.coordinator.lease.KvSnapshotLeaseManager;
 import org.apache.fluss.server.coordinator.remote.RemoteDirDynamicLoader;
 import org.apache.fluss.server.coordinator.statemachine.BucketState;
@@ -343,6 +348,542 @@ void testDropTableWithRetry() throws Exception {
                 () -> assertThat(zookeeperClient.getTableAssignment(t1Id)).isEmpty());
     }
 
+    @Test
+    void testStopReplicaSendFailedTransitionsReplicasToOffline() throws Exception {
+        // Verification #3: dispatch StopReplicaSendFailedEvent. Replicas in
+        // ReplicaDeletionStarted should be transitioned to ReplicaDeletionIneligible by
+        // processStopReplicaSendFailed, then resumeDeletions Step 2 retries them back to
+        // OfflineReplica. The owning table is marked ineligible so onDeleteTable does not
+        // fire.
+        initCoordinatorChannel();
+        TablePath t1 = TablePath.of(defaultDatabase, "t_stop_replica_send_failed");
+        final long t1Id =
+                createTable(
+                        t1,
+                        new TabletServerInfo[] {
+                            new TabletServerInfo(0, "rack0"),
+                            new TabletServerInfo(1, "rack1"),
+                            new TabletServerInfo(2, "rack2")
+                        });
+        // wait until the table has its full replica set in the context AND all are online
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.replicaCounts(t1Id)).isEqualTo(N_BUCKETS * REPLICATION_FACTOR);
+                    assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                });
+
+        // Setup: queue table for deletion and force all replicas into ReplicaDeletionStarted.
+        Set<TableBucketReplica> replicas =
+                fromCtx(
+                        ctx -> {
+                            ctx.queueTableDeletion(Collections.singleton(t1Id));
+                            Set<TableBucketReplica> all = ctx.getAllReplicasForTable(t1Id);
+                            for (TableBucketReplica r : all) {
+                                ctx.putReplicaState(r, ReplicaState.ReplicaDeletionStarted);
+                            }
+                            return all;
+                        });
+        assertThat(replicas).hasSize(N_BUCKETS * REPLICATION_FACTOR);
+
+        // Dispatch the event under test.
+        eventProcessor.getCoordinatorEventManager().put(new StopReplicaSendFailedEvent(replicas));
+
+        // After processing: table is ineligible and all replicas land in OfflineReplica
+        // (Started -> Ineligible by processStopReplicaSendFailed, then Ineligible ->
+        // OfflineReplica by resumeDeletions Step 2 retry).
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue();
+                    for (TableBucketReplica r : replicas) {
+                        assertThat(ctx.getReplicaState(r)).isEqualTo(OfflineReplica);
+                    }
+                });
+    }
+
+    @Test
+    void testProcessDeadTabletServerHandlesAllToBeDeletedReplicas() throws Exception {
+        // Verification #4 + Fix #8: processDeadTabletServer must transition BOTH subsets
+        // of to-be-deleted replicas on the dead server:
+        //   - Subset A: already in ReplicaDeletionStarted -> Ineligible
+        //   - Subset B: in OnlineReplica -> OfflineReplica -> Ineligible
+        // After resumeDeletions Step 2, both end up in OfflineReplica. The owning table
+        // is marked ineligible.
+        initCoordinatorChannel();
+        TablePath t1 = TablePath.of(defaultDatabase, "t_dead_ts_to_be_deleted");
+        final long t1Id =
+                createTable(
+                        t1,
+                        new TabletServerInfo[] {
+                            new TabletServerInfo(0, "rack0"),
+                            new TabletServerInfo(1, "rack1"),
+                            new TabletServerInfo(2, "rack2")
+                        });
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.replicaCounts(t1Id)).isEqualTo(N_BUCKETS * REPLICATION_FACTOR);
+                    assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                });
+
+        // Setup: queue table for deletion. Pick replicas on server 0; force one into
+        // ReplicaDeletionStarted (subset A); leave others as OnlineReplica (subset B).
+        final int deadServer = 0;
+        Set<TableBucketReplica> server0Replicas =
+                fromCtx(
+                        ctx -> {
+                            ctx.queueTableDeletion(Collections.singleton(t1Id));
+                            Set<TableBucketReplica> all =
+                                    ctx.getAllReplicasForTable(t1Id).stream()
+                                            .filter(r -> r.getReplica() == deadServer)
+                                            .collect(Collectors.toSet());
+                            assertThat(all).isNotEmpty();
+                            boolean first = true;
+                            for (TableBucketReplica r : all) {
+                                if (first) {
+                                    ctx.putReplicaState(r, ReplicaState.ReplicaDeletionStarted);
+                                    first = false;
+                                }
+                                // others remain in OnlineReplica
+                            }
+                            return all;
+                        });
+
+        // Dispatch the event under test.
+        eventProcessor.getCoordinatorEventManager().put(new DeadTabletServerEvent(deadServer));
+
+        // After processing: server 0 removed from live set, table marked ineligible,
+        // BOTH subsets in OfflineReplica (Ineligible -> OfflineReplica via Step 2).
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.liveTabletServerSet()).doesNotContain(deadServer);
+                    assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue();
+                    for (TableBucketReplica r : server0Replicas) {
+                        assertThat(ctx.getReplicaState(r)).isEqualTo(OfflineReplica);
+                    }
+                });
+    }
+
+    @Test
+    void testProcessNewTabletServerResumesTableDeletion() throws Exception {
+        // Verification #5: when a previously-dead tablet server reconnects,
+        // processNewTabletServer should clear the ineligible mark on tables queued for
+        // deletion that have replicas on it, then resumeDeletions which re-fires
+        // onDeleteTable. The deletion completes normally.
+        initCoordinatorChannel();
+        TablePath t1 = TablePath.of(defaultDatabase, "t_new_ts_resume_table");
+        final long t1Id =
+                createTable(
+                        t1,
+                        new TabletServerInfo[] {
+                            new TabletServerInfo(0, "rack0"),
+                            new TabletServerInfo(1, "rack1"),
+                            new TabletServerInfo(2, "rack2")
+                        });
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.replicaCounts(t1Id)).isEqualTo(N_BUCKETS * REPLICATION_FACTOR);
+                    assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                });
+
+        final int reconnectingServer = 0;
+        // capture serverInfo BEFORE removing the server from live set
+        ServerInfo capturedServerInfo =
+                fromCtx(ctx -> ctx.getLiveTabletServers().get(reconnectingServer));
+        assertThat(capturedServerInfo).isNotNull();
+
+        // Setup: queue + mark ineligible + transition all replicas to OfflineReplica and
+        // remove the server from the live set so processNewTabletServer's guard does not
+        // short-circuit.
+        fromCtx(
+                ctx -> {
+                    ctx.queueTableDeletion(Collections.singleton(t1Id));
+                    ctx.markTableIneligibleForDeletion(t1Id, "test-setup");
+                    for (TableBucketReplica r : ctx.getAllReplicasForTable(t1Id)) {
+                        ctx.putReplicaState(r, OfflineReplica);
+                    }
+                    ctx.removeLiveTabletServer(reconnectingServer);
+                    return null;
+                });
+
+        // Sanity: ineligible flag is set before the event.
+        Boolean wasIneligible = fromCtx(ctx -> ctx.isTableIneligibleForDeletion(t1Id));
+        assertThat(wasIneligible).isTrue();
+
+        // Dispatch the reconnect event.
+        eventProcessor
+                .getCoordinatorEventManager()
+                .put(new NewTabletServerEvent(capturedServerInfo));
+
+        // After processing: server back in live set, ineligible flag cleared, deletion
+        // proceeds, assignment eventually removed from ZK.
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.liveTabletServerSet()).contains(reconnectingServer);
+                    assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isFalse();
+                });
+        retry(
+                Duration.ofMinutes(1),
+                () -> assertThat(zookeeperClient.getTableAssignment(t1Id)).isEmpty());
+    }
+
+    @Test
+    void testInitIneligibleForDeletionMarksTableWithReplicaOnDeadServer() throws Exception {
+        // Fix #4 (Verification #17): when the coordinator initializes and finds tables
+        // queued for deletion (assignment in ZK but no table info in metadataManager) whose
+        // replicas live on a tablet server NOT in liveTabletServerSet, mark the table
+        // ineligible for deletion until that server reconnects.
+        final int extraServer = 3;
+        TabletServerRegistration serverRegistration =
+                new TabletServerRegistration(
+                        "rack3",
+                        Collections.singletonList(
+                                new Endpoint("host3", 2222, DEFAULT_LISTENER_NAME)),
+                        System.currentTimeMillis());
+        // register a 4th tablet server so we can place a replica on it
+        zookeeperClient.registerTabletServer(extraServer, serverRegistration);
+        try {
+            retryVerifyContext(ctx -> assertThat(ctx.liveTabletServerSet()).contains(extraServer));
+            initCoordinatorChannel();
+
+            TablePath t1 = TablePath.of(defaultDatabase, "t_init_ineligible");
+            TableAssignment tableAssignment =
+                    TableAssignment.builder()
+                            .add(0, BucketAssignment.of(extraServer, 0, 1))
+                            .build();
+            long t1Id =
+                    metadataManager.createTable(
+                            t1, remoteDataDir, TEST_TABLE, tableAssignment, false);
+            retryVerifyContext(
+                    ctx -> {
+                        assertThat(ctx.getTablePathById(t1Id)).isNotNull();
+                        assertThat(ctx.replicaCounts(t1Id)).isEqualTo(3);
+                        assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                    });
+
+            // shutdown so the drop-table event below is not processed
+            eventProcessor.shutdown();
+            // delete only the table node from ZK - the assignment node is left behind so on
+            // restart the coordinator detects this as a queued-for-deletion table
+            metadataManager.dropTable(t1, false);
+            // remove the 4th server from ZK so init sees it as offline
+            ZOO_KEEPER_EXTENSION_WRAPPER
+                    .getCustomExtension()
+                    .cleanupPath(ZkData.ServerIdZNode.path(extraServer));
+
+            // restart the coordinator
+            eventProcessor = buildCoordinatorEventProcessor();
+            initCoordinatorChannel();
+            eventProcessor.startup();
+
+            // Verify: table is queued and marked ineligible (replica on dead server 3).
+            retryVerifyContext(
+                    ctx -> {
+                        assertThat(ctx.liveTabletServerSet()).doesNotContain(extraServer);
+                        assertThat(ctx.isTableQueuedForDeletion(t1Id)).isTrue();
+                        assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue();
+                    });
+        } finally {
+            ZOO_KEEPER_EXTENSION_WRAPPER
+                    .getCustomExtension()
+                    .cleanupPath(ZkData.ServerIdZNode.path(extraServer));
+        }
+    }
+
+    @Test
+    void testInitIneligibleRecoversWhenServerReconnects() throws Exception {
+        // Fix #4 + Change 6 (Verification #18): after init marks a table ineligible because
+        // a hosting tablet server is offline, when that server reconnects via
+        // NewTabletServerEvent, the ineligible mark is cleared and resumeDeletions
+        // re-fires onDeleteTable. The deletion completes normally.
+        final int extraServer = 3;
+        TabletServerRegistration serverRegistration =
+                new TabletServerRegistration(
+                        "rack3",
+                        Collections.singletonList(
+                                new Endpoint("host3", 3333, DEFAULT_LISTENER_NAME)),
+                        System.currentTimeMillis());
+        zookeeperClient.registerTabletServer(extraServer, serverRegistration);
+        try {
+            retryVerifyContext(ctx -> assertThat(ctx.liveTabletServerSet()).contains(extraServer));
+            initCoordinatorChannel();
+
+            TablePath t1 = TablePath.of(defaultDatabase, "t_init_recover");
+            TableAssignment tableAssignment =
+                    TableAssignment.builder()
+                            .add(0, BucketAssignment.of(extraServer, 0, 1))
+                            .build();
+            long t1Id =
+                    metadataManager.createTable(
+                            t1, remoteDataDir, TEST_TABLE, tableAssignment, false);
+            retryVerifyContext(
+                    ctx -> {
+                        assertThat(ctx.getTablePathById(t1Id)).isNotNull();
+                        assertThat(ctx.replicaCounts(t1Id)).isEqualTo(3);
+                        assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                    });
+
+            // shutdown coordinator, drop table (leaves assignment in ZK), kill server 3
+            eventProcessor.shutdown();
+            metadataManager.dropTable(t1, false);
+            ZOO_KEEPER_EXTENSION_WRAPPER
+                    .getCustomExtension()
+                    .cleanupPath(ZkData.ServerIdZNode.path(extraServer));
+
+            // restart - init marks the table ineligible
+            eventProcessor = buildCoordinatorEventProcessor();
+            initCoordinatorChannel();
+            eventProcessor.startup();
+            retryVerifyContext(ctx -> assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue());
+
+            // bring server 3 back - the watcher will trigger NewTabletServerEvent
+            zookeeperClient.registerTabletServer(extraServer, serverRegistration);
+            // refresh the gateway map so stopReplica calls succeed for all servers
+            initCoordinatorChannel();
+
+            // After reconnect: ineligible flag cleared, deletion completes.
+            retryVerifyContext(
+                    ctx -> {
+                        assertThat(ctx.liveTabletServerSet()).contains(extraServer);
+                        assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isFalse();
+                    });
+            retry(
+                    Duration.ofMinutes(1),
+                    () -> assertThat(zookeeperClient.getTableAssignment(t1Id)).isEmpty());
+        } finally {
+            ZOO_KEEPER_EXTENSION_WRAPPER
+                    .getCustomExtension()
+                    .cleanupPath(ZkData.ServerIdZNode.path(extraServer));
+        }
+    }
+
+    @Test
+    void testProcessDropTableSkipsOnDeleteWhenIneligible() throws Exception {
+        // Fix #9 (Verification #21): processDropTable now routes through
+        // tableManager.resumeDeletions instead of calling onDeleteTable directly. This
+        // means the three-guard check (queued + no Started + not ineligible) is applied
+        // uniformly. If we pre-mark the table ineligible before the drop event is
+        // processed, onDeleteTable must NOT fire and replicas must remain in OnlineReplica.
+        initCoordinatorChannel();
+        TablePath t1 = TablePath.of(defaultDatabase, "t_drop_ineligible_skip");
+        final long t1Id =
+                createTable(
+                        t1,
+                        new TabletServerInfo[] {
+                            new TabletServerInfo(0, "rack0"),
+                            new TabletServerInfo(1, "rack1"),
+                            new TabletServerInfo(2, "rack2")
+                        });
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.replicaCounts(t1Id)).isEqualTo(N_BUCKETS * REPLICATION_FACTOR);
+                    assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
+                });
+
+        // Pre-queue and pre-mark ineligible BEFORE the drop event is processed.
+        fromCtx(
+                ctx -> {
+                    ctx.queueTableDeletion(Collections.singleton(t1Id));
+                    ctx.markTableIneligibleForDeletion(t1Id, "test-pre-mark");
+                    return null;
+                });
+
+        // Dispatch drop event - processDropTable will queue (idempotent) and call
+        // resumeDeletions, but the ineligible flag prevents onDeleteTable.
+        eventProcessor.getCoordinatorEventManager().put(new DropTableEvent(t1Id, false, false));
+
+        // Replicas should remain in OnlineReplica - onDeleteTable was guarded.
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue();
+                    for (TableBucketReplica r : ctx.getAllReplicasForTable(t1Id)) {
+                        assertThat(ctx.getReplicaState(r)).isEqualTo(OnlineReplica);
+                    }
+                });
+    }
+
+    @Test
+    void testProcessDropPartitionSkipsOnDeleteWhenIneligible() throws Exception {
+        // Fix #9 (Verification #22): partition variant of #21. processDropPartition routes
+        // through tableManager.resumeDeletions; with a pre-set ineligible flag, the
+        // partition should not be deleted and its replicas stay in OnlineReplica.
+        TablePath tablePath = TablePath.of(defaultDatabase, "t_drop_partition_skip");
+        initCoordinatorChannel();
+        TableDescriptor descriptor = getPartitionedTable();
+        long tableId =
+                metadataManager.createTable(tablePath, remoteDataDir, descriptor, null, false);
+
+        int nBuckets = 3;
+        int replicationFactor = 3;
+        Map<Integer, BucketAssignment> assignments =
+                generateAssignment(
+                                nBuckets,
+                                replicationFactor,
+                                new TabletServerInfo[] {
+                                    new TabletServerInfo(0, "rack0"),
+                                    new TabletServerInfo(1, "rack1"),
+                                    new TabletServerInfo(2, "rack2")
+                                })
+                        .getBucketAssignments();
+        PartitionAssignment partitionAssignment = new PartitionAssignment(tableId, assignments);
+        Tuple2<PartitionIdName, PartitionIdName> partitions =
+                preparePartitionAssignment(tablePath, tableId, partitionAssignment);
+        long partitionId = partitions.f0.partitionId;
+        String partitionName = partitions.f0.partitionName;
+        TablePartition tablePartition = new TablePartition(tableId, partitionId);
+
+        verifyPartitionCreated(tablePartition, partitionAssignment, nBuckets, replicationFactor);
+
+        // Pre-queue and pre-mark ineligible.
+        fromCtx(
+                ctx -> {
+                    ctx.queuePartitionDeletion(Collections.singleton(tablePartition));
+                    ctx.markPartitionIneligibleForDeletion(tablePartition, "test-pre-mark");
+                    return null;
+                });
+
+        eventProcessor
+                .getCoordinatorEventManager()
+                .put(new DropPartitionEvent(tableId, partitionId, partitionName));
+
+        // The partition replicas should remain in OnlineReplica - onDeletePartition skipped.
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.isPartitionIneligibleForDeletion(tablePartition)).isTrue();
+                    for (TableBucketReplica r :
+                            ctx.getAllReplicasForPartition(tableId, partitionId)) {
+                        assertThat(ctx.getReplicaState(r)).isEqualTo(OnlineReplica);
+                    }
+                });
+    }
+
+    @Test
+    void testProcessDeadTabletServerMarksPartitionIneligible() throws Exception {
+        // Fix #8 partition variant (Verification #24): same as #4 but for partition replicas.
+        TablePath tablePath = TablePath.of(defaultDatabase, "t_dead_ts_partition_ineligible");
+        initCoordinatorChannel();
+        TableDescriptor descriptor = getPartitionedTable();
+        long tableId =
+                metadataManager.createTable(tablePath, remoteDataDir, descriptor, null, false);
+
+        int nBuckets = 3;
+        int replicationFactor = 3;
+        Map<Integer, BucketAssignment> assignments =
+                generateAssignment(
+                                nBuckets,
+                                replicationFactor,
+                                new TabletServerInfo[] {
+                                    new TabletServerInfo(0, "rack0"),
+                                    new TabletServerInfo(1, "rack1"),
+                                    new TabletServerInfo(2, "rack2")
+                                })
+                        .getBucketAssignments();
+        PartitionAssignment partitionAssignment = new PartitionAssignment(tableId, assignments);
+        Tuple2<PartitionIdName, PartitionIdName> partitions =
+                preparePartitionAssignment(tablePath, tableId, partitionAssignment);
+        long partitionId = partitions.f0.partitionId;
+        TablePartition tablePartition = new TablePartition(tableId, partitionId);
+
+        verifyPartitionCreated(tablePartition, partitionAssignment, nBuckets, replicationFactor);
+
+        final int deadServer = 0;
+        // Setup: queue partition for deletion. Force one server-0 replica into Started
+        // (subset A); leave others as OnlineReplica (subset B).
+        Set<TableBucketReplica> server0Replicas =
+                fromCtx(
+                        ctx -> {
+                            ctx.queuePartitionDeletion(Collections.singleton(tablePartition));
+                            Set<TableBucketReplica> all =
+                                    ctx.getAllReplicasForPartition(tableId, partitionId).stream()
+                                            .filter(r -> r.getReplica() == deadServer)
+                                            .collect(Collectors.toSet());
+                            assertThat(all).isNotEmpty();
+                            boolean first = true;
+                            for (TableBucketReplica r : all) {
+                                if (first) {
+                                    ctx.putReplicaState(r, ReplicaState.ReplicaDeletionStarted);
+                                    first = false;
+                                }
+                            }
+                            return all;
+                        });
+
+        eventProcessor.getCoordinatorEventManager().put(new DeadTabletServerEvent(deadServer));
+
+        // After processing: server 0 not live, partition marked ineligible, both subsets
+        // ultimately in OfflineReplica.
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.liveTabletServerSet()).doesNotContain(deadServer);
+                    assertThat(ctx.isPartitionIneligibleForDeletion(tablePartition)).isTrue();
+                    for (TableBucketReplica r : server0Replicas) {
+                        assertThat(ctx.getReplicaState(r)).isEqualTo(OfflineReplica);
+                    }
+                });
+    }
+
+    @Test
+    void testProcessNewTabletServerResumesPartitionDeletion() throws Exception {
+        // Change 6 partition variant (Verification #25): partition queued for deletion
+        // with replicas on a previously-offline TS that's now reconnecting.
+        // markPartitionEligibleForDeletion is called and resumeDeletions completes the
+        // partition deletion.
+        TablePath tablePath = TablePath.of(defaultDatabase, "t_new_ts_resume_partition");
+        initCoordinatorChannel();
+        TableDescriptor descriptor = getPartitionedTable();
+        long tableId =
+                metadataManager.createTable(tablePath, remoteDataDir, descriptor, null, false);
+
+        int nBuckets = 3;
+        int replicationFactor = 3;
+        Map<Integer, BucketAssignment> assignments =
+                generateAssignment(
+                                nBuckets,
+                                replicationFactor,
+                                new TabletServerInfo[] {
+                                    new TabletServerInfo(0, "rack0"),
+                                    new TabletServerInfo(1, "rack1"),
+                                    new TabletServerInfo(2, "rack2")
+                                })
+                        .getBucketAssignments();
+        PartitionAssignment partitionAssignment = new PartitionAssignment(tableId, assignments);
+        Tuple2<PartitionIdName, PartitionIdName> partitionsTuple =
+                preparePartitionAssignment(tablePath, tableId, partitionAssignment);
+        long partitionId = partitionsTuple.f0.partitionId;
+        TablePartition tablePartition = new TablePartition(tableId, partitionId);
+
+        verifyPartitionCreated(tablePartition, partitionAssignment, nBuckets, replicationFactor);
+
+        final int reconnectingServer = 0;
+        ServerInfo capturedServerInfo =
+                fromCtx(ctx -> ctx.getLiveTabletServers().get(reconnectingServer));
+        assertThat(capturedServerInfo).isNotNull();
+
+        // Setup: queue + mark ineligible + replicas in OfflineReplica + remove server 0.
+        fromCtx(
+                ctx -> {
+                    ctx.queuePartitionDeletion(Collections.singleton(tablePartition));
+                    ctx.markPartitionIneligibleForDeletion(tablePartition, "test-setup");
+                    for (TableBucketReplica r :
+                            ctx.getAllReplicasForPartition(tableId, partitionId)) {
+                        ctx.putReplicaState(r, OfflineReplica);
+                    }
+                    ctx.removeLiveTabletServer(reconnectingServer);
+                    return null;
+                });
+
+        eventProcessor
+                .getCoordinatorEventManager()
+                .put(new NewTabletServerEvent(capturedServerInfo));
+
+        retryVerifyContext(
+                ctx -> {
+                    assertThat(ctx.liveTabletServerSet()).contains(reconnectingServer);
+                    assertThat(ctx.isPartitionIneligibleForDeletion(tablePartition)).isFalse();
+                });
+        retry(
+                Duration.ofMinutes(1),
+                () -> assertThat(zookeeperClient.getPartitionAssignment(partitionId)).isEmpty());
+    }
+
     @Test
     void testServerBecomeOnlineAndOfflineLine() throws Exception {
         // make sure all request to gateway should be successful
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
new file mode 100644
index 0000000000..1d280dbaf2
--- /dev/null
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator;
+
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.metadata.TableBucketReplica;
+import org.apache.fluss.server.coordinator.event.CoordinatorEvent;
+import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
+import org.apache.fluss.server.coordinator.event.TestingEventManager;
+import org.apache.fluss.server.zk.ZkEpoch;
+import org.apache.fluss.server.zk.data.LeaderAndIsr;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.apache.fluss.server.coordinator.CoordinatorTestUtils.createServers;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Test for {@link CoordinatorRequestBatch}, focusing on the pre-check in {@code sendStopRequest}
+ * that surfaces a {@link StopReplicaSendFailedEvent} when a target tablet server is not in the live
+ * set.
+ */
+class CoordinatorRequestBatchTest {
+
+    private static final long TABLE_ID = 1L;
+    private static final int BUCKET = 0;
+    private static final int LIVE_SERVER = 1;
+    private static final int DEAD_SERVER = 2;
+    private static final int LEADER_EPOCH = 0;
+    private static final int COORDINATOR_EPOCH = 0;
+
+    private CoordinatorContext coordinatorContext;
+    private TestCoordinatorChannelManager testCoordinatorChannelManager;
+    private TestingEventManager testingEventManager;
+    private CoordinatorRequestBatch requestBatch;
+    private TableBucket tableBucket;
+
+    @BeforeEach
+    void beforeEach() {
+        coordinatorContext = new CoordinatorContext(ZkEpoch.INITIAL_EPOCH);
+        // servers 0 and 1 are live; server 2 is intentionally NOT in the live set
+        coordinatorContext.setLiveTabletServers(createServers(Arrays.asList(0, LIVE_SERVER)));
+
+        // bucket has replicas on the live server and the dead server
+        tableBucket = new TableBucket(TABLE_ID, BUCKET);
+        coordinatorContext.updateBucketReplicaAssignment(
+                tableBucket, Arrays.asList(LIVE_SERVER, DEAD_SERVER));
+        coordinatorContext.putBucketLeaderAndIsr(
+                tableBucket,
+                new LeaderAndIsr(
+                        LIVE_SERVER,
+                        LEADER_EPOCH,
+                        Arrays.asList(LIVE_SERVER, DEAD_SERVER),
+                        Collections.emptyList(),
+                        COORDINATOR_EPOCH,
+                        0));
+
+        // no gateways are set on the channel manager; the live-set pre-check is what matters here
+        testCoordinatorChannelManager = new TestCoordinatorChannelManager();
+        testingEventManager = new TestingEventManager();
+        requestBatch =
+                new CoordinatorRequestBatch(
+                        testCoordinatorChannelManager, testingEventManager, coordinatorContext);
+    }
+
+    @Test
+    void testSendStopRequestEmitsFailedEventWhenServerNotInLiveSet() {
+        // schedule a delete=true stop-replica targeting BOTH the live and dead server
+        Set<Integer> targetServers = new HashSet<>(Arrays.asList(LIVE_SERVER, DEAD_SERVER));
+        requestBatch.newBatch();
+        requestBatch.addStopReplicaRequestForTabletServers(
+                targetServers, tableBucket, true, true, LEADER_EPOCH);
+
+        // fire sendStopRequest internally
+        requestBatch.sendRequestToTabletServers(COORDINATOR_EPOCH);
+
+        // exactly one StopReplicaSendFailedEvent should be emitted (for the dead server)
+        List<StopReplicaSendFailedEvent> failedEvents =
+                testingEventManager.getEvents().stream()
+                        .filter(e -> e instanceof StopReplicaSendFailedEvent)
+                        .map(e -> (StopReplicaSendFailedEvent) e)
+                        .collect(Collectors.toList());
+        assertThat(failedEvents).hasSize(1);
+
+        Set<TableBucketReplica> failedReplicas = failedEvents.get(0).getFailedReplicas();
+        assertThat(failedReplicas)
+                .containsExactly(new TableBucketReplica(tableBucket, DEAD_SERVER))
+                .doesNotContain(new TableBucketReplica(tableBucket, LIVE_SERVER));
+    }
+
+    @Test
+    void testSendStopRequestSilentlySkipsNonDeleteForDeadServer() {
+        // schedule a non-deletion stop-replica (delete=false, deleteRemote=false) targeting the
+        // dead server; this should be silently skipped without surfacing any event
+        requestBatch.newBatch();
+        requestBatch.addStopReplicaRequestForTabletServers(
+                Collections.singleton(DEAD_SERVER), tableBucket, false, false, LEADER_EPOCH);
+
+        requestBatch.sendRequestToTabletServers(COORDINATOR_EPOCH);
+
+        // no StopReplicaSendFailedEvent should be emitted for non-delete stop replicas
+        List<CoordinatorEvent> failedEvents =
+                testingEventManager.getEvents().stream()
+                        .filter(e -> e instanceof StopReplicaSendFailedEvent)
+                        .collect(Collectors.toList());
+        assertThat(failedEvents).isEmpty();
+    }
+}
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
index 2abec7264b..0a1a3dfd62 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
@@ -19,8 +19,11 @@
 
 import org.apache.fluss.config.ConfigOptions;
 import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.PartitionSpec;
+import org.apache.fluss.metadata.Schema;
 import org.apache.fluss.metadata.TableBucket;
 import org.apache.fluss.metadata.TableDescriptor;
+import org.apache.fluss.metadata.TablePartition;
 import org.apache.fluss.metadata.TablePath;
 import org.apache.fluss.rpc.gateway.CoordinatorGateway;
 import org.apache.fluss.server.replica.Replica;
@@ -29,8 +32,13 @@
 import org.apache.fluss.server.testutils.RpcMessageTestUtils;
 import org.apache.fluss.server.zk.ZooKeeperClient;
 import org.apache.fluss.server.zk.data.LeaderAndIsr;
+import org.apache.fluss.server.zk.data.PartitionAssignment;
+import org.apache.fluss.server.zk.data.PartitionRegistration;
+import org.apache.fluss.types.DataTypes;
 
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.RegisterExtension;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
@@ -38,7 +46,10 @@
 import java.nio.file.Path;
 import java.time.Duration;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
 
 import static org.apache.fluss.record.TestData.DATA1_TABLE_DESCRIPTOR;
 import static org.apache.fluss.record.TestData.DATA1_TABLE_DESCRIPTOR_PK;
@@ -132,6 +143,281 @@ void testStopReplica(boolean isPkTable) throws Exception {
         retryUtilReplicaNotExist(tb, isr2, tableDirs2);
     }
 
+    /**
+     * Verification item #10 (table-path end-to-end recovery): verifies that when a tablet server
+     * hosting one of the table's replicas goes offline, dropping the table does NOT get stuck and
+     * deletion completes once the offline tablet server reconnects (Change 5 + Change 6 + Change 9
+     * cooperate to pause and then resume the deletion).
+     */
+    @Test
+    void testDropTableCompletesAfterOfflineTabletServerReconnect() throws Exception {
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllGatewayHasSameMetadata();
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        TablePath tablePath = TablePath.of("test_db_stop_replica", "test_drop_after_reconnect");
+        TableDescriptor tableDescriptor =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("a", DataTypes.INT())
+                                        .column("b", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(1)
+                        .property(ConfigOptions.TABLE_REPLICATION_FACTOR, 3)
+                        .build();
+        long tableId =
+                RpcMessageTestUtils.createTable(
+                        FLUSS_CLUSTER_EXTENSION, tablePath, tableDescriptor);
+        TableBucket tb = new TableBucket(tableId, 0);
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllReplicaReady(tb);
+
+        List<Integer> isr = waitAndGetIsr(tb);
+        List<Path> tableDirs = assertReplicaExistAndGetTableOrPartitionDirs(tb, isr, false);
+
+        // pick a non-leader replica to take offline so that leadership churn does not prevent
+        // dropTable from being processed quickly.
+        int leader = FLUSS_CLUSTER_EXTENSION.waitAndGetLeader(tb);
+        int offlineServerId =
+                isr.stream().filter(id -> id != leader).findFirst().orElse(isr.get(0));
+
+        FLUSS_CLUSTER_EXTENSION.stopTabletServer(offlineServerId);
+        // wait until coordinator has observed the tablet server going away.
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(2);
+
+        // drop the table — this returns after queueing; the actual deletion is paused because
+        // one of the replica's tablet servers is offline.
+        coordinatorGateway
+                .dropTable(
+                        RpcMessageTestUtils.newDropTableRequest(
+                                tablePath.getDatabaseName(), tablePath.getTableName(), false))
+                .get();
+        // the table's metadata is removed from zk synchronously in dropTable, but the bucket
+        // assignment lives on until all replicas have been stopped.
+        assertThat(zkClient.tableExist(tablePath)).isFalse();
+
+        // bring the tablet server back. Change 6 (processNewTabletServer) clears the ineligible
+        // mark and resumeDeletions drives the pending deletion to completion.
+        FLUSS_CLUSTER_EXTENSION.startTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        retryUntilDeletionCleanedInZk(tb, isr, tableDirs, offlineServerId);
+    }
+
+    /**
+     * Verification item #11 (Change 4b end-to-end): would verify that when a stopReplica RPC fails
+     * to send while the target tablet server is still in {@code liveTabletServerSet}, the replica
+     * ends up in {@code OfflineReplica} and the table is marked ineligible until the next event
+     * removes the ineligibility.
+     *
+     * <p>Reason for {@code @Disabled}: deterministically simulating only the RPC-layer send failure
+     * (without taking the tablet server offline) requires fault injection into the coordinator's
+     * gateway, which {@link FlussClusterExtension} does not currently expose. Killing the tablet
+     * server to trigger the same code path is functionally equivalent to {@link
+     * #testDropTableCompletesAfterOfflineTabletServerReconnect()} (#10) and would be redundant. The
+     * corresponding behaviour is already covered at the unit level by the {@code
+     * StopReplicaSendFailedEvent} / {@code CoordinatorRequestBatch} tests (verification items 3 and
+     * 16).
+     */
+    @Test
+    @Disabled(
+            "Requires gateway-level fault injection not yet supported by FlussClusterExtension; "
+                    + "covered by unit tests for StopReplicaSendFailedEvent and "
+                    + "CoordinatorRequestBatch.sendStopRequest.")
+    void testDropTableRecoversFromStopReplicaRpcSendFailure() {
+        // Intentionally left blank — see @Disabled rationale.
+    }
+
+    /**
+     * Verification item #14 (Change 9 end-to-end): when a tablet server is already offline at the
+     * time {@code dropTable} is invoked, deletion must NOT throw and must NOT get stuck. Replicas
+     * on the dead tablet server should be parked in {@code ReplicaDeletionIneligible} and the table
+     * marked ineligible; once the tablet server reconnects, deletion completes.
+     */
+    @Test
+    void testDropTableNotStuckWhenTabletServerDeadAtKickoff() throws Exception {
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllGatewayHasSameMetadata();
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        TablePath tablePath = TablePath.of("test_db_stop_replica", "test_drop_dead_kickoff");
+        TableDescriptor tableDescriptor =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("a", DataTypes.INT())
+                                        .column("b", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(1)
+                        .property(ConfigOptions.TABLE_REPLICATION_FACTOR, 3)
+                        .build();
+        long tableId =
+                RpcMessageTestUtils.createTable(
+                        FLUSS_CLUSTER_EXTENSION, tablePath, tableDescriptor);
+        TableBucket tb = new TableBucket(tableId, 0);
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllReplicaReady(tb);
+
+        List<Integer> isr = waitAndGetIsr(tb);
+        List<Path> tableDirs = assertReplicaExistAndGetTableOrPartitionDirs(tb, isr, false);
+
+        int leader = FLUSS_CLUSTER_EXTENSION.waitAndGetLeader(tb);
+        int offlineServerId =
+                isr.stream().filter(id -> id != leader).findFirst().orElse(isr.get(0));
+
+        // 1. Take the tablet server offline BEFORE issuing dropTable so the coordinator hits
+        //    the dead-TS branch in onDeleteTableBucket (Change 9).
+        FLUSS_CLUSTER_EXTENSION.stopTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(2);
+
+        // 2. dropTable must return successfully even though replicas live on a dead TS.
+        coordinatorGateway
+                .dropTable(
+                        RpcMessageTestUtils.newDropTableRequest(
+                                tablePath.getDatabaseName(), tablePath.getTableName(), false))
+                .get();
+        assertThat(zkClient.tableExist(tablePath)).isFalse();
+
+        // 3. Deletion must be paused (table marked ineligible, assignment still present in zk).
+        CoordinatorContext coordinatorContext =
+                FLUSS_CLUSTER_EXTENSION
+                        .getCoordinatorServer()
+                        .getCoordinatorEventProcessor()
+                        .getCoordinatorContext();
+        retry(
+                Duration.ofMinutes(1),
+                () ->
+                        assertThat(coordinatorContext.isTableIneligibleForDeletion(tableId))
+                                .as(
+                                        "table should be marked ineligible while tablet server "
+                                                + offlineServerId
+                                                + " is offline")
+                                .isTrue());
+        // table assignment still in zk — deletion paused, NOT stuck.
+        assertThat(zkClient.getTableAssignment(tableId)).isPresent();
+
+        // 4. Bring the tablet server back; Change 6 clears the ineligible mark and the
+        //    pending deletion resumes.
+        FLUSS_CLUSTER_EXTENSION.startTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        retryUntilDeletionCleanedInZk(tb, isr, tableDirs, offlineServerId);
+        retry(
+                Duration.ofMinutes(1),
+                () ->
+                        assertThat(coordinatorContext.isTableIneligibleForDeletion(tableId))
+                                .as("ineligible mark should be cleared after deletion completes")
+                                .isFalse());
+    }
+
+    /**
+     * Verification item #27 (partition variant of #14): when a tablet server is offline at the time
+     * a partition is dropped, partition deletion must NOT get stuck. Once the tablet server
+     * reconnects, the partition is fully removed.
+     */
+    @Test
+    void testDropPartitionNotStuckWhenTabletServerDeadAtKickoff() throws Exception {
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllGatewayHasSameMetadata();
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        TablePath tablePath = TablePath.of("test_db_stop_replica", "test_drop_partition_dead");
+        TableDescriptor tableDescriptor =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("a", DataTypes.INT())
+                                        .column("b", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(1)
+                        .partitionedBy("b")
+                        .property(ConfigOptions.TABLE_REPLICATION_FACTOR, 3)
+                        .build();
+        long tableId =
+                RpcMessageTestUtils.createTable(
+                        FLUSS_CLUSTER_EXTENSION, tablePath, tableDescriptor);
+
+        // Create one partition explicitly (auto-partition is disabled for this descriptor).
+        String partitionName = "p1";
+        long partitionId =
+                RpcMessageTestUtils.createPartition(
+                        FLUSS_CLUSTER_EXTENSION,
+                        tablePath,
+                        new PartitionSpec(Collections.singletonMap("b", partitionName)),
+                        false);
+        TableBucket tb = new TableBucket(tableId, partitionId, 0);
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllReplicaReady(tb);
+
+        List<Integer> isr = waitAndGetIsr(tb);
+        List<Path> partitionDirs = assertReplicaExistAndGetTableOrPartitionDirs(tb, isr, false);
+
+        int leader = FLUSS_CLUSTER_EXTENSION.waitAndGetLeader(tb);
+        int offlineServerId =
+                isr.stream().filter(id -> id != leader).findFirst().orElse(isr.get(0));
+
+        // Take the tablet server offline BEFORE dropping the partition.
+        FLUSS_CLUSTER_EXTENSION.stopTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(2);
+
+        // dropPartition must return without throwing.
+        coordinatorGateway
+                .dropPartition(
+                        RpcMessageTestUtils.newDropPartitionRequest(
+                                tablePath,
+                                new PartitionSpec(Collections.singletonMap("b", partitionName)),
+                                false))
+                .get();
+
+        // Partition registration is removed synchronously, but the assignment for the
+        // partition's buckets remains until all replicas finish stopping.
+        retry(
+                Duration.ofMinutes(1),
+                () -> {
+                    Map<String, PartitionRegistration> partitions =
+                            zkClient.getPartitionRegistrations(tablePath);
+                    assertThat(partitions).doesNotContainKey(partitionName);
+                });
+
+        TablePartition tablePartition = new TablePartition(tableId, partitionId);
+        CoordinatorContext coordinatorContext =
+                FLUSS_CLUSTER_EXTENSION
+                        .getCoordinatorServer()
+                        .getCoordinatorEventProcessor()
+                        .getCoordinatorContext();
+        retry(
+                Duration.ofMinutes(1),
+                () ->
+                        assertThat(
+                                        coordinatorContext.isPartitionIneligibleForDeletion(
+                                                tablePartition))
+                                .as(
+                                        "partition should be marked ineligible while tablet "
+                                                + "server "
+                                                + offlineServerId
+                                                + " is offline")
+                                .isTrue());
+        Optional<PartitionAssignment> partitionAssignment =
+                zkClient.getPartitionAssignment(partitionId);
+        assertThat(partitionAssignment)
+                .as("partition assignment should still be present while paused")
+                .isPresent();
+
+        // Bring the tablet server back; partition deletion should resume and complete.
+        FLUSS_CLUSTER_EXTENSION.startTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        retryUntilDeletionCleanedInZk(tb, isr, partitionDirs, offlineServerId);
+        retry(
+                Duration.ofMinutes(1),
+                () -> assertThat(zkClient.getPartitionAssignment(partitionId)).isEmpty());
+        retry(
+                Duration.ofMinutes(1),
+                () ->
+                        assertThat(
+                                        coordinatorContext.isPartitionIneligibleForDeletion(
+                                                tablePartition))
+                                .as(
+                                        "partition ineligible mark should be cleared after "
+                                                + "deletion completes")
+                                .isFalse());
+    }
+
     private List<Integer> waitAndGetIsr(TableBucket tb) {
         LeaderAndIsr leaderAndIsr =
                 waitValue(
@@ -192,6 +478,46 @@ private void retryUtilReplicaNotExist(
                 });
     }
 
+    /**
+     * Variant of {@link #retryUtilReplicaNotExist} for tests that restart a tablet server during
+     * the drop flow. The on-disk directory check is skipped for the restarted tablet server because
+     * its {@code ReplicaManager.allReplicas} is empty after restart and {@code stopReplica} for a
+     * {@code NoneReplica} bucket is a no-op (pre-existing TabletServer-side limitation, orthogonal
+     * to the coordinator-side deletion-resume fix exercised by this test). All other assertions —
+     * ZK assignment / LeaderAndIsr emptiness and {@code NoneReplica} state on every replica — are
+     * preserved.
+     */
+    private void retryUntilDeletionCleanedInZk(
+            TableBucket tableBucket,
+            List<Integer> isr,
+            List<Path> tableDirs,
+            int restartedServerId) {
+        retry(
+                Duration.ofMinutes(1),
+                () -> {
+                    for (int i = 0; i < isr.size(); i++) {
+                        int replicaId = isr.get(i);
+                        if (replicaId != restartedServerId) {
+                            assertThat(tableDirs.get(i))
+                                    .as(
+                                            "local replica dir on still-running tablet server "
+                                                    + replicaId
+                                                    + " should be removed")
+                                    .doesNotExist();
+                        }
+                        ReplicaManager replicaManager =
+                                FLUSS_CLUSTER_EXTENSION
+                                        .getTabletServerById(replicaId)
+                                        .getReplicaManager();
+                        assertThat(replicaManager.getReplica(tableBucket))
+                                .isInstanceOf(ReplicaManager.NoneReplica.class);
+                    }
+
+                    assertThat(zkClient.getTableAssignment(tableBucket.getTableId())).isEmpty();
+                    assertThat(zkClient.getLeaderAndIsr(tableBucket)).isEmpty();
+                });
+    }
+
     private static Configuration initConfig() {
         Configuration conf = new Configuration();
         conf.setInt(ConfigOptions.DEFAULT_REPLICATION_FACTOR, 3);
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
index f5357c549f..b802fad8e6 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
@@ -26,6 +26,7 @@
 import org.apache.fluss.server.coordinator.event.CoordinatorEvent;
 import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
 import org.apache.fluss.server.coordinator.event.TestingEventManager;
+import org.apache.fluss.server.coordinator.statemachine.ReplicaState;
 import org.apache.fluss.server.coordinator.statemachine.ReplicaStateMachine;
 import org.apache.fluss.server.coordinator.statemachine.TableBucketStateMachine;
 import org.apache.fluss.server.entity.DeleteReplicaResultForBucket;
@@ -324,6 +325,359 @@ void testCreateAndDropPartition() throws Exception {
         assertThat(coordinatorContext.getAllReplicasForPartition(tableId, partitionId)).isEmpty();
     }
 
+    @Test
+    void testResumeTableDeletionsCompletesWhenAllReplicasSuccessful() throws Exception {
+        // Step 1 happy-path: when all replicas are ReplicaDeletionSuccessful,
+        // resumeTableDeletions should finalize deletion (remove ZK assignment + replicas).
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment assignment = createAssignment();
+        zookeeperClient.registerTableAssignment(tableId, assignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH_PK,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR_PK,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH_PK, tableId, assignment);
+
+        // queue table for deletion and put all replicas in ReplicaDeletionSuccessful
+        coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+        for (TableBucketReplica replica : getReplicas(tableId, assignment)) {
+            coordinatorContext.putReplicaState(replica, ReplicaDeletionSuccessful);
+        }
+
+        tableManager.resumeDeletions();
+
+        retry(
+                Duration.ofSeconds(30),
+                () -> assertThat(zookeeperClient.getTableAssignment(tableId)).isEmpty());
+        assertThat(coordinatorContext.getAllReplicasForTable(tableId)).isEmpty();
+    }
+
+    @Test
+    void testResumeTableDeletionsRetryStepTransitionsIneligibleToOffline() throws Exception {
+        // Step 2: when no replica is ReplicaDeletionStarted AND any replica is
+        // ReplicaDeletionIneligible, push Ineligible -> OfflineReplica regardless of the
+        // table-ineligible flag (Ineligible is transient). Step 3 guard still keeps the
+        // table from being onDeleteTable'd while the ineligible mark remains.
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment assignment = createAssignment();
+        zookeeperClient.registerTableAssignment(tableId, assignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH_PK,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR_PK,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH_PK, tableId, assignment);
+
+        coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+        coordinatorContext.markTableIneligibleForDeletion(tableId, "test reason");
+
+        // half of the replicas in Ineligible, the rest in OfflineReplica; none Started
+        Set<TableBucketReplica> replicas = getReplicas(tableId, assignment);
+        Set<TableBucketReplica> ineligibleReplicas = new HashSet<>();
+        int idx = 0;
+        for (TableBucketReplica replica : replicas) {
+            if (idx % 2 == 0) {
+                coordinatorContext.putReplicaState(replica, ReplicaState.ReplicaDeletionIneligible);
+                ineligibleReplicas.add(replica);
+            } else {
+                coordinatorContext.putReplicaState(replica, ReplicaState.OfflineReplica);
+            }
+            idx++;
+        }
+
+        testingEventManager.clearEvents();
+        tableManager.resumeDeletions();
+
+        // (a) previously-Ineligible replicas now in OfflineReplica
+        for (TableBucketReplica replica : ineligibleReplicas) {
+            assertThat(coordinatorContext.getReplicaState(replica))
+                    .isEqualTo(ReplicaState.OfflineReplica);
+        }
+        // (b) the table is still ineligible (Step 3 guard kept it)
+        assertThat(coordinatorContext.isTableIneligibleForDeletion(tableId)).isTrue();
+        // (c) no DeleteReplicaResponseReceivedEvent emitted (onDeleteTable not invoked)
+        assertThat(collectDeleteReplicaSuccessEvents()).isEmpty();
+    }
+
+    @Test
+    void testResumeTableDeletionsStep3GuardBlocksWhenIneligible() throws Exception {
+        // Step 3 guard: a table queued for deletion but marked ineligible must NOT be
+        // onDeleteTable'd; replicas should remain in OfflineReplica (not transition to
+        // ReplicaDeletionStarted) and no stop-replica delete event should fire.
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment assignment = createAssignment();
+        zookeeperClient.registerTableAssignment(tableId, assignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH_PK,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR_PK,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH_PK, tableId, assignment);
+
+        coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+        coordinatorContext.markTableIneligibleForDeletion(tableId, "guard test");
+
+        Set<TableBucketReplica> replicas = getReplicas(tableId, assignment);
+        for (TableBucketReplica replica : replicas) {
+            coordinatorContext.putReplicaState(replica, ReplicaState.OfflineReplica);
+        }
+
+        testingEventManager.clearEvents();
+        tableManager.resumeDeletions();
+
+        assertThat(collectDeleteReplicaSuccessEvents()).isEmpty();
+        for (TableBucketReplica replica : replicas) {
+            assertThat(coordinatorContext.getReplicaState(replica))
+                    .isEqualTo(ReplicaState.OfflineReplica);
+        }
+    }
+
+    @Test
+    void testResumeTableDeletionsStep3FiresWhenEligibilityRestored() throws Exception {
+        // After eligibility is restored (e.g., owning TS reconnected), the next
+        // resumeDeletions should pass the Step 3 guard and invoke onDeleteTable, which
+        // pushes replicas to ReplicaDeletionStarted and emits DeleteReplicaResponseReceivedEvent
+        // for each via the always-success gateway.
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment assignment = createAssignment();
+        zookeeperClient.registerTableAssignment(tableId, assignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH_PK,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR_PK,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH_PK, tableId, assignment);
+
+        coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+        coordinatorContext.markTableIneligibleForDeletion(tableId, "before restore");
+
+        Set<TableBucketReplica> replicas = getReplicas(tableId, assignment);
+        for (TableBucketReplica replica : replicas) {
+            coordinatorContext.putReplicaState(replica, ReplicaState.OfflineReplica);
+        }
+
+        // simulate the TS-reconnect / eligibility-restore path
+        coordinatorContext.markTableEligibleForDeletion(tableId);
+
+        testingEventManager.clearEvents();
+        tableManager.resumeDeletions();
+
+        // gateway always succeeds; checkReplicaDelete asserts the expected
+        // DeleteReplicaResponseReceivedEvent set was emitted by onDeleteTable.
+        checkReplicaDelete(tableId, null, assignment);
+    }
+
+    @Test
+    void testOnDeleteTableBucketSplitsAliveAndDeadByLiveSet() throws Exception {
+        // Drop server 2 from the live set BEFORE the bucket is created so that
+        // onDeleteTableBucket sees an alive/dead split.
+        coordinatorContext.removeLiveTabletServer(2);
+        CoordinatorTestUtils.makeSendLeaderAndStopRequestAlwaysSuccess(
+                coordinatorContext, testCoordinatorChannelManager);
+
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment assignment =
+                TableAssignment.builder().add(0, BucketAssignment.of(1, 2)).build();
+        zookeeperClient.registerTableAssignment(tableId, assignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH_PK,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR_PK,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH_PK, tableId, assignment);
+
+        coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+        tableManager.onDeleteTable(tableId);
+
+        TableBucket tableBucket = new TableBucket(tableId, 0);
+        TableBucketReplica aliveReplica = new TableBucketReplica(tableBucket, 1);
+        TableBucketReplica deadReplica = new TableBucketReplica(tableBucket, 2);
+
+        // alive replica went OfflineReplica -> ReplicaDeletionStarted
+        assertThat(coordinatorContext.getReplicaState(aliveReplica))
+                .isEqualTo(ReplicaState.ReplicaDeletionStarted);
+        // dead replica went OfflineReplica -> ReplicaDeletionIneligible
+        assertThat(coordinatorContext.getReplicaState(deadReplica))
+                .isEqualTo(ReplicaState.ReplicaDeletionIneligible);
+        // and the table is marked ineligible (with the documented reason internally)
+        assertThat(coordinatorContext.isTableIneligibleForDeletion(tableId)).isTrue();
+    }
+
+    @Test
+    void testResumePartitionDeletionsMirrorsTablePath() throws Exception {
+        // Partition counterpart combining Step 1 (success completion) and
+        // Step 2 + Step 3 (ineligible-flag retry + guard) in one test.
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment tableAssignment = TableAssignment.builder().build();
+        zookeeperClient.registerTableAssignment(tableId, tableAssignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH, tableId, tableAssignment);
+
+        // ----- Step 1: all replicas successful, partition deletion completes -----
+        PartitionAssignment partitionAssignment =
+                new PartitionAssignment(tableId, createAssignment().getBucketAssignments());
+        long partitionId = zookeeperClient.getPartitionIdAndIncrement();
+        // partition name is keyed off the auto-incremented partitionId so it is unique
+        // across all tests in the class — partition metadata znodes (PartitionZNode) are
+        // not removed by completeDeletePartition and would otherwise collide between tests.
+        String partitionName = "p_resume_step1_" + partitionId;
+        zookeeperClient.registerPartitionAssignmentAndMetadata(
+                partitionId,
+                partitionName,
+                partitionAssignment,
+                DEFAULT_REMOTE_DATA_DIR,
+                DATA1_TABLE_PATH,
+                tableId);
+        tableManager.onCreateNewPartition(
+                DATA1_TABLE_PATH, tableId, partitionId, partitionName, partitionAssignment);
+
+        TablePartition tp1 = new TablePartition(tableId, partitionId);
+        coordinatorContext.queuePartitionDeletion(Collections.singleton(tp1));
+        for (TableBucketReplica replica : getReplicas(tableId, partitionId, partitionAssignment)) {
+            coordinatorContext.putReplicaState(replica, ReplicaDeletionSuccessful);
+        }
+
+        tableManager.resumeDeletions();
+        retry(
+                Duration.ofSeconds(30),
+                () -> assertThat(zookeeperClient.getPartitionAssignment(partitionId)).isEmpty());
+        assertThat(coordinatorContext.getAllReplicasForPartition(tableId, partitionId)).isEmpty();
+
+        // ----- Step 2 + Step 3: ineligible-flag retry transition + guard -----
+        PartitionAssignment partitionAssignment2 =
+                new PartitionAssignment(tableId, createAssignment().getBucketAssignments());
+        long partitionId2 = zookeeperClient.getPartitionIdAndIncrement();
+        String partitionName2 = "p_resume_step23_" + partitionId2;
+        zookeeperClient.registerPartitionAssignmentAndMetadata(
+                partitionId2,
+                partitionName2,
+                partitionAssignment2,
+                DEFAULT_REMOTE_DATA_DIR,
+                DATA1_TABLE_PATH,
+                tableId);
+        tableManager.onCreateNewPartition(
+                DATA1_TABLE_PATH, tableId, partitionId2, partitionName2, partitionAssignment2);
+
+        TablePartition tp2 = new TablePartition(tableId, partitionId2);
+        coordinatorContext.queuePartitionDeletion(Collections.singleton(tp2));
+        coordinatorContext.markPartitionIneligibleForDeletion(tp2, "test reason");
+
+        Set<TableBucketReplica> replicas2 =
+                getReplicas(tableId, partitionId2, partitionAssignment2);
+        Set<TableBucketReplica> ineligibleReplicas = new HashSet<>();
+        int idx = 0;
+        for (TableBucketReplica replica : replicas2) {
+            if (idx % 2 == 0) {
+                coordinatorContext.putReplicaState(replica, ReplicaState.ReplicaDeletionIneligible);
+                ineligibleReplicas.add(replica);
+            } else {
+                coordinatorContext.putReplicaState(replica, ReplicaState.OfflineReplica);
+            }
+            idx++;
+        }
+
+        testingEventManager.clearEvents();
+        tableManager.resumeDeletions();
+
+        // Step 2: previously-Ineligible replicas now in OfflineReplica
+        for (TableBucketReplica replica : ineligibleReplicas) {
+            assertThat(coordinatorContext.getReplicaState(replica))
+                    .isEqualTo(ReplicaState.OfflineReplica);
+        }
+        // Step 3 guard: partition still marked ineligible, onDeletePartition NOT invoked
+        assertThat(coordinatorContext.isPartitionIneligibleForDeletion(tp2)).isTrue();
+        assertThat(collectDeleteReplicaSuccessEvents()).isEmpty();
+    }
+
+    @Test
+    void testOnDeletePartitionAliveDeadSplit() throws Exception {
+        // Partition variant of the alive/dead split: dead replicas go to
+        // ReplicaDeletionIneligible and the partition is marked ineligible.
+        coordinatorContext.removeLiveTabletServer(2);
+        CoordinatorTestUtils.makeSendLeaderAndStopRequestAlwaysSuccess(
+                coordinatorContext, testCoordinatorChannelManager);
+
+        long tableId = zookeeperClient.getTableIdAndIncrement();
+        TableAssignment tableAssignment = TableAssignment.builder().build();
+        zookeeperClient.registerTableAssignment(tableId, tableAssignment);
+
+        coordinatorContext.putTableInfo(
+                TableInfo.of(
+                        DATA1_TABLE_PATH,
+                        tableId,
+                        0,
+                        DATA1_TABLE_DESCRIPTOR,
+                        DEFAULT_REMOTE_DATA_DIR,
+                        System.currentTimeMillis(),
+                        System.currentTimeMillis()));
+        tableManager.onCreateNewTable(DATA1_TABLE_PATH, tableId, tableAssignment);
+
+        PartitionAssignment partitionAssignment =
+                new PartitionAssignment(
+                        tableId,
+                        TableAssignment.builder()
+                                .add(0, BucketAssignment.of(1, 2))
+                                .build()
+                                .getBucketAssignments());
+        long partitionId = zookeeperClient.getPartitionIdAndIncrement();
+        String partitionName = "p_alive_dead_" + partitionId;
+        zookeeperClient.registerPartitionAssignmentAndMetadata(
+                partitionId,
+                partitionName,
+                partitionAssignment,
+                DEFAULT_REMOTE_DATA_DIR,
+                DATA1_TABLE_PATH,
+                tableId);
+        tableManager.onCreateNewPartition(
+                DATA1_TABLE_PATH, tableId, partitionId, partitionName, partitionAssignment);
+
+        TablePartition tp = new TablePartition(tableId, partitionId);
+        coordinatorContext.queuePartitionDeletion(Collections.singleton(tp));
+        tableManager.onDeletePartition(tableId, partitionId);
+
+        TableBucket tableBucket = new TableBucket(tableId, partitionId, 0);
+        TableBucketReplica deadReplica = new TableBucketReplica(tableBucket, 2);
+
+        assertThat(coordinatorContext.getReplicaState(deadReplica))
+                .isEqualTo(ReplicaState.ReplicaDeletionIneligible);
+        assertThat(coordinatorContext.isPartitionIneligibleForDeletion(tp)).isTrue();
+    }
+
     private TableAssignment createAssignment() {
         return TableAssignment.builder()
                 .add(0, BucketAssignment.of(0, 1, 2))
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
index 4d1946ab47..3f06b56fb4 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
@@ -60,6 +60,7 @@
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.NewReplica;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OfflineReplica;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.OnlineReplica;
+import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionIneligible;
 import static org.apache.fluss.server.coordinator.statemachine.ReplicaState.ReplicaDeletionStarted;
 import static org.assertj.core.api.Assertions.assertThat;
 
@@ -304,6 +305,52 @@ void testOfflineReplicaShouldBeRemovedFromIsr() throws Exception {
                                 3));
     }
 
+    @Test
+    void testReplicaDeletionIneligibleTransitions() {
+        CoordinatorContext coordinatorContext = new CoordinatorContext(zkEpoch);
+        coordinatorContext.setLiveTabletServers(
+                CoordinatorTestUtils.createServers(Arrays.asList(0, 1)));
+        ReplicaStateMachine replicaStateMachine = createReplicaStateMachine(coordinatorContext);
+
+        long tableId = 1;
+        TableBucket tableBucket = new TableBucket(tableId, 0);
+        TableBucketReplica replica = new TableBucketReplica(tableBucket, 0);
+
+        // pre-seed: directly put replica into ReplicaDeletionStarted; we are exercising
+        // the state machine's transition validation, not how the replica got into Started.
+        coordinatorContext.putReplicaState(replica, ReplicaDeletionStarted);
+
+        // ReplicaDeletionStarted -> ReplicaDeletionIneligible should be a valid transition
+        replicaStateMachine.handleStateChanges(
+                Collections.singletonList(replica), ReplicaDeletionIneligible);
+        assertThat(coordinatorContext.getReplicaState(replica))
+                .isEqualTo(ReplicaDeletionIneligible);
+
+        // ReplicaDeletionIneligible -> OfflineReplica should be a valid transition
+        replicaStateMachine.handleStateChanges(Collections.singletonList(replica), OfflineReplica);
+        assertThat(coordinatorContext.getReplicaState(replica)).isEqualTo(OfflineReplica);
+    }
+
+    @Test
+    void testIneligibleToOnlineTransitionIsValid() {
+        CoordinatorContext coordinatorContext = new CoordinatorContext(zkEpoch);
+        coordinatorContext.setLiveTabletServers(
+                CoordinatorTestUtils.createServers(Arrays.asList(0, 1)));
+        ReplicaStateMachine replicaStateMachine = createReplicaStateMachine(coordinatorContext);
+
+        long tableId = 1;
+        TableBucket tableBucket = new TableBucket(tableId, 0);
+        TableBucketReplica replica = new TableBucketReplica(tableBucket, 0);
+
+        // pre-seed Ineligible directly: we want to verify checkValidReplicaStateChange
+        // accepts Ineligible -> OnlineReplica.
+        coordinatorContext.putReplicaState(replica, ReplicaDeletionIneligible);
+
+        replicaStateMachine.handleStateChanges(Collections.singletonList(replica), OnlineReplica);
+
+        assertThat(coordinatorContext.getReplicaState(replica)).isEqualTo(OnlineReplica);
+    }
+
     private void toReplicaDeletionStartedState(
             ReplicaStateMachine replicaStateMachine, Collection<TableBucketReplica> replicas) {
         replicaStateMachine.handleStateChanges(replicas, NewReplica);

From 3e192c392ed0418c3a72c2e04f41ca1dfccc947c Mon Sep 17 00:00:00 2001
From: Yang Guo <jimmy_guoyang@outlook.com>
Date: Sun, 24 May 2026 18:48:05 +0800
Subject: [PATCH 2/4] fix: stop-replica-failed

---
 .../fluss/server/coordinator/CoordinatorEventProcessor.java    | 2 ++
 .../fluss/server/coordinator/CoordinatorRequestBatch.java      | 1 +
 .../java/org/apache/fluss/server/coordinator/TableManager.java | 3 +++
 3 files changed, 6 insertions(+)

diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
index ce4befe146..bfa323016e 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
@@ -908,6 +908,7 @@ private void processDropTable(DropTableEvent dropTableEvent) {
         }
 
         coordinatorContext.queueTableDeletion(Collections.singleton(tableId));
+
         // Route through resumeDeletions so that the eligibility check applies uniformly.
         tableManager.resumeDeletions();
         if (dropTableEvent.isAutoPartitionTable()) {
@@ -944,6 +945,7 @@ private void processDropPartition(DropPartitionEvent dropPartitionEvent) {
         }
 
         coordinatorContext.queuePartitionDeletion(Collections.singleton(tablePartition));
+
         // Route through resumeDeletions so that the eligibility check applies uniformly.
         tableManager.resumeDeletions();
         autoPartitionManager.removePartition(tableId, dropPartitionEvent.getPartitionName());
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
index 126e6098b2..a7e0c36033 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
@@ -490,6 +490,7 @@ private void sendStopRequest(int coordinatorEpoch) {
                                     "Failed to send stop replica request to tablet server {}.",
                                     serverId,
                                     throwable);
+
                             // For deletion replicas, surface as StopReplicaSendFailedEvent so
                             // they can be marked ineligible. Non-delete stop replica failures
                             // are best-effort and can be ignored.
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
index ab2c94dcbb..0715b7a332 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/TableManager.java
@@ -426,6 +426,9 @@ private void asyncDeletePartitionMetadata(long partitionId) {
 
     private boolean isEligibleForDeletion(long tableId) {
         // Three-guard check: queued, no replica in Started, not in ineligible set.
+        //   1. queued for deletion,
+        //   2. no replica currently in ReplicaDeletionStarted,
+        //   3. table not in ineligible set (e.g., owning TS still down).
         return coordinatorContext.isTableQueuedForDeletion(tableId)
                 && !coordinatorContext.isAnyReplicaInState(
                         tableId, ReplicaState.ReplicaDeletionStarted)

From 463f7288d1417ae50a5421293f5a3793ef121137 Mon Sep 17 00:00:00 2001
From: Yang Guo <jimmy_guoyang@outlook.com>
Date: Sun, 24 May 2026 18:48:05 +0800
Subject: [PATCH 3/4] fix: stop-replica-failed

---
 .../org/apache/fluss/server/coordinator/StopReplicaITCase.java   | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
index 0a1a3dfd62..143fc24459 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
@@ -403,6 +403,7 @@ void testDropPartitionNotStuckWhenTabletServerDeadAtKickoff() throws Exception {
         FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
 
         retryUntilDeletionCleanedInZk(tb, isr, partitionDirs, offlineServerId);
+
         retry(
                 Duration.ofMinutes(1),
                 () -> assertThat(zkClient.getPartitionAssignment(partitionId)).isEmpty());

From caaaebb96859355f8233280b76138f5cdf1a549c Mon Sep 17 00:00:00 2001
From: Yang Guo <jimmy_guoyang@outlook.com>
Date: Wed, 27 May 2026 18:46:03 +0800
Subject: [PATCH 4/4] feat: per tablet server sender thread

---
 .../apache/fluss/config/ConfigOptions.java    |  19 +
 .../org/apache/fluss/metrics/MetricNames.java |   8 +
 .../utils/concurrent/ShutdownableThread.java  |  11 +
 .../fluss/metrics/util/TestMetricGroup.java   |  21 +-
 .../CoordinatorChannelManager.java            | 187 +++++++++-
 .../coordinator/CoordinatorContext.java       |  55 +--
 .../CoordinatorEventProcessor.java            | 110 +++---
 .../coordinator/CoordinatorRequestBatch.java  |  91 +----
 .../server/coordinator/CoordinatorServer.java |  18 +-
 .../server/coordinator/channel/ApiKey.java    |  30 ++
 .../channel/ControlRequestSendThread.java     | 204 ++++++++++
 .../server/coordinator/channel/QueueItem.java |  87 +++++
 .../channel/TabletServerChannelState.java     |  93 +++++
 .../event/StopReplicaSendFailedEvent.java     |  41 --
 .../CoordinatorChannelManagerTest.java        |   6 +-
 .../coordinator/CoordinatorContextTest.java   |   6 +-
 .../CoordinatorEventProcessorTest.java        |  88 ++---
 .../CoordinatorRequestBatchTest.java          | 131 -------
 .../server/coordinator/StopReplicaITCase.java | 102 +++--
 .../server/coordinator/TableManagerTest.java  |   2 +-
 .../TestCoordinatorChannelManager.java        |  34 +-
 .../channel/ControlRequestSendThreadTest.java | 349 ++++++++++++++++++
 .../statemachine/ReplicaStateMachineTest.java |   6 +-
 .../TableBucketStateMachineTest.java          |   5 +-
 24 files changed, 1231 insertions(+), 473 deletions(-)
 create mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ApiKey.java
 create mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThread.java
 create mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/QueueItem.java
 create mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/TabletServerChannelState.java
 delete mode 100644 fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
 delete mode 100644 fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
 create mode 100644 fluss-server/src/test/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThreadTest.java

diff --git a/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java b/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java
index b6154fa477..6da230ef86 100644
--- a/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java
+++ b/fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java
@@ -859,6 +859,25 @@ public class ConfigOptions {
                     .withDescription("The amount of time to sleep when fetch bucket error occurs.")
                     .withFallbackKeys("log.replica.fetch-backoff-interval");
 
+    public static final ConfigOption<Duration> COORDINATOR_REQUEST_RETRY_BACKOFF =
+            key("coordinator.request-retry.backoff-interval")
+                    .durationType()
+                    .defaultValue(Duration.ofMillis(100))
+                    .withDescription(
+                            "The backoff duration the coordinator waits before retrying a "
+                                    + "control-plane request to a tablet server after a "
+                                    + "transient RPC-layer failure. Mirrors Kafka's "
+                                    + "ControllerChannelManager retry backoff (hardcoded 100ms).");
+
+    public static final ConfigOption<Duration> COORDINATOR_REQUEST_TIMEOUT =
+            key("coordinator.request.timeout")
+                    .durationType()
+                    .defaultValue(Duration.ofSeconds(30))
+                    .withDescription(
+                            "The timeout the sender thread waits for a response to a "
+                                    + "control-plane request before treating it as failed "
+                                    + "and retrying.");
+
     public static final ConfigOption<MemorySize> LOG_REPLICA_FETCH_MAX_BYTES =
             key("log.replica.fetch.max-bytes")
                     .memoryType()
diff --git a/fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java b/fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java
index 07ebf57b6d..0d47b0db22 100644
--- a/fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java
+++ b/fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java
@@ -45,6 +45,14 @@ public class MetricNames {
     public static final String PARTITION_COUNT = "partitionCount";
     public static final String REPLICAS_TO_DELETE_COUNT = "replicasToDeleteCount";
 
+    // for coordinator sender (per-tablet-server control request sender threads)
+    public static final String SENDER_TOTAL_QUEUE_SIZE = "senderTotalQueueSize";
+    public static final String SENDER_QUEUE_SIZE = "senderQueueSize";
+    public static final String SENDER_QUEUE_TIME_MS = "senderQueueTimeMs";
+    public static final String SENDER_RETRY_COUNT = "senderRetryCount";
+    public static final String SENDER_STALE_DROP_COUNT = "senderStaleDropCount";
+    public static final String SENDER_ALIVE = "senderAlive";
+
     // for coordinator event processor
     public static final String EVENT_QUEUE_SIZE = "eventQueueSize";
     public static final String EVENT_QUEUE_TIME_MS = "eventQueueTimeMs";
diff --git a/fluss-common/src/main/java/org/apache/fluss/utils/concurrent/ShutdownableThread.java b/fluss-common/src/main/java/org/apache/fluss/utils/concurrent/ShutdownableThread.java
index 54a202feea..f7b319db38 100644
--- a/fluss-common/src/main/java/org/apache/fluss/utils/concurrent/ShutdownableThread.java
+++ b/fluss-common/src/main/java/org/apache/fluss/utils/concurrent/ShutdownableThread.java
@@ -21,6 +21,7 @@
 import org.slf4j.LoggerFactory;
 
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 
 /** An abstract thread that is shutdownable . */
 public abstract class ShutdownableThread extends Thread {
@@ -110,6 +111,16 @@ public void run() {
         log.info("Stopped");
     }
 
+    /**
+     * Causes the current thread to wait until the shutdown is initiated, or the specified waiting
+     * time elapses.
+     */
+    public void pause(long timeout, TimeUnit unit) throws InterruptedException {
+        if (shutdownInitiated.await(timeout, unit)) {
+            log.trace("shutdownInitiated latch count reached zero. Shutdown called.");
+        }
+    }
+
     public boolean isRunning() {
         return !isShutdownInitiated();
     }
diff --git a/fluss-common/src/test/java/org/apache/fluss/metrics/util/TestMetricGroup.java b/fluss-common/src/test/java/org/apache/fluss/metrics/util/TestMetricGroup.java
index 785574037a..7b83d20a4d 100644
--- a/fluss-common/src/test/java/org/apache/fluss/metrics/util/TestMetricGroup.java
+++ b/fluss-common/src/test/java/org/apache/fluss/metrics/util/TestMetricGroup.java
@@ -22,10 +22,12 @@
 import org.apache.fluss.metrics.Gauge;
 import org.apache.fluss.metrics.Histogram;
 import org.apache.fluss.metrics.Meter;
+import org.apache.fluss.metrics.Metric;
 import org.apache.fluss.metrics.SimpleCounter;
 import org.apache.fluss.metrics.groups.MetricGroup;
 
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.Map;
 import java.util.Optional;
 import java.util.function.BiFunction;
@@ -37,6 +39,7 @@ public class TestMetricGroup implements MetricGroup {
     private final Map<String, String> variables;
     private final BiFunction<String, Optional<CharacterFilter>, String> metricIdentifierFunction;
     private final BiFunction<CharacterFilter, Optional<Character>, String> logicalScopeFunction;
+    private final Map<String, Metric> metrics = new HashMap<>();
 
     public TestMetricGroup(
             String[] scopeComponents,
@@ -49,32 +52,48 @@ public TestMetricGroup(
         this.logicalScopeFunction = logicalScopeFunction;
     }
 
+    /** Creates a default {@link TestMetricGroup} suitable for unit tests. */
+    public static TestMetricGroup createTestMetricGroup() {
+        return newBuilder().build();
+    }
+
     public static TestMetricGroupBuilder newBuilder() {
         return new TestMetricGroupBuilder();
     }
 
+    /** Returns a metric previously registered under the given name. */
+    public Metric getMetric(String name) {
+        return metrics.get(name);
+    }
+
     @Override
     public Counter counter(String name) {
-        return new SimpleCounter();
+        SimpleCounter counter = new SimpleCounter();
+        metrics.put(name, counter);
+        return counter;
     }
 
     @Override
     public <C extends Counter> C counter(String name, C counter) {
+        metrics.put(name, counter);
         return counter;
     }
 
     @Override
     public <T, G extends Gauge<T>> G gauge(String name, G gauge) {
+        metrics.put(name, gauge);
         return gauge;
     }
 
     @Override
     public <H extends Histogram> H histogram(String name, H histogram) {
+        metrics.put(name, histogram);
         return histogram;
     }
 
     @Override
     public <M extends Meter> M meter(String name, M meter) {
+        metrics.put(name, meter);
         return meter;
     }
 
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorChannelManager.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorChannelManager.java
index e095935ae2..cb606e3475 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorChannelManager.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorChannelManager.java
@@ -20,6 +20,10 @@
 import org.apache.fluss.annotation.VisibleForTesting;
 import org.apache.fluss.cluster.ServerNode;
 import org.apache.fluss.cluster.ServerType;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.TableBucketReplica;
+import org.apache.fluss.metrics.MetricNames;
+import org.apache.fluss.metrics.groups.MetricGroup;
 import org.apache.fluss.rpc.RpcClient;
 import org.apache.fluss.rpc.gateway.TabletServerGateway;
 import org.apache.fluss.rpc.messages.ApiMessage;
@@ -35,39 +39,83 @@
 import org.apache.fluss.rpc.messages.StopReplicaResponse;
 import org.apache.fluss.rpc.messages.UpdateMetadataRequest;
 import org.apache.fluss.rpc.messages.UpdateMetadataResponse;
+import org.apache.fluss.server.coordinator.channel.ApiKey;
+import org.apache.fluss.server.coordinator.channel.ControlRequestSendThread;
+import org.apache.fluss.server.coordinator.channel.QueueItem;
+import org.apache.fluss.server.coordinator.channel.TabletServerChannelState;
 import org.apache.fluss.server.utils.RpcGatewayManager;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.annotation.concurrent.NotThreadSafe;
+import javax.annotation.Nullable;
 
+import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.function.BiConsumer;
+import java.util.function.IntSupplier;
 
 import static org.apache.fluss.utils.Preconditions.checkState;
 
 /**
- * Using by coordinator server. It's a manager to manage the rpc channels to tablet servers and send
- * request to the servers.
+ * Used by the coordinator server to manage RPC channels to tablet servers and send requests.
+ * Mutations are guarded by {@code channelLock} so the metric reporter thread can safely read queue
+ * sizes. Mirrors Kafka's {@code ControllerChannelManager} which uses a {@code brokerLock} for the
+ * same purpose.
  */
-@NotThreadSafe
 public class CoordinatorChannelManager {
 
     private static final Logger LOG = LoggerFactory.getLogger(CoordinatorChannelManager.class);
 
+    private static final int WINDOW_SIZE = 100;
+
     /** A manager for the rpc gateways to tablet servers. */
     private final RpcGatewayManager<TabletServerGateway> rpcGatewayManager;
 
-    public CoordinatorChannelManager(RpcClient rpcClient) {
+    private final IntSupplier epochSupplier;
+    private final Configuration conf;
+    private final MetricGroup coordinatorMetricGroup;
+
+    private final Object channelLock = new Object();
+    private final Map<Integer, TabletServerChannelState> channelStates = new HashMap<>();
+
+    public CoordinatorChannelManager(
+            RpcClient rpcClient,
+            IntSupplier epochSupplier,
+            Configuration conf,
+            MetricGroup coordinatorMetricGroup) {
         this.rpcGatewayManager = new RpcGatewayManager<>(rpcClient, TabletServerGateway.class);
+        this.epochSupplier = epochSupplier;
+        this.conf = conf;
+        this.coordinatorMetricGroup = coordinatorMetricGroup;
+        coordinatorMetricGroup.gauge(
+                MetricNames.SENDER_TOTAL_QUEUE_SIZE,
+                () -> {
+                    synchronized (channelLock) {
+                        int total = 0;
+                        for (TabletServerChannelState s : channelStates.values()) {
+                            total += s.getQueue().size();
+                        }
+                        return total;
+                    }
+                });
     }
 
     public void startup(Collection<ServerNode> serverNodes) {
         for (ServerNode serverNode : serverNodes) {
-            addTabletServer(serverNode);
+            addNewTabletServer(serverNode);
+        }
+        synchronized (channelLock) {
+            for (TabletServerChannelState state : channelStates.values()) {
+                startSendThread(state);
+            }
         }
     }
 
@@ -75,16 +123,61 @@ public void close() throws Exception {
         rpcGatewayManager.close();
     }
 
+    /** Adds a tablet server and immediately starts its sender thread (runtime addition). */
     public void addTabletServer(ServerNode serverNode) {
-        // add new tablet server to the channel manager
+        addNewTabletServer(serverNode);
+        synchronized (channelLock) {
+            TabletServerChannelState state = channelStates.get(serverNode.id());
+            if (state != null) {
+                startSendThread(state);
+            }
+        }
+    }
+
+    private void addNewTabletServer(ServerNode serverNode) {
         checkState(
                 serverNode.serverType().equals(ServerType.TABLET_SERVER),
                 "The server type should be TABLET_SERVER, but was " + serverNode.serverType());
 
         rpcGatewayManager.addServer(serverNode);
+
+        int id = serverNode.id();
+        synchronized (channelLock) {
+            if (channelStates.containsKey(id)) {
+                return;
+            }
+            BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+
+            MetricGroup tsGroup =
+                    coordinatorMetricGroup.addGroup("tablet-server-id", String.valueOf(id));
+            tsGroup.gauge(MetricNames.SENDER_QUEUE_SIZE, queue::size);
+
+            ControlRequestSendThread thread =
+                    new ControlRequestSendThread(
+                            id,
+                            queue,
+                            () -> rpcGatewayManager.getRpcGateway(id),
+                            epochSupplier,
+                            conf,
+                            tsGroup);
+            channelStates.put(
+                    id, new TabletServerChannelState(id, serverNode, queue, thread, tsGroup));
+        }
+    }
+
+    private void startSendThread(TabletServerChannelState state) {
+        ControlRequestSendThread thread = state.getSendThread();
+        if (thread.getState() == Thread.State.NEW) {
+            thread.start();
+        }
     }
 
     public void removeTabletServer(Integer serverId) {
+        synchronized (channelLock) {
+            TabletServerChannelState state = channelStates.remove(serverId);
+            teardownChannelState(serverId, state);
+        }
+
         rpcGatewayManager
                 .removeServer(serverId)
                 .exceptionally(
@@ -97,6 +190,37 @@ public void removeTabletServer(Integer serverId) {
                         });
     }
 
+    /** Shuts down all per-tablet-server sender threads and deregisters their metrics. */
+    public void shutdown() {
+        synchronized (channelLock) {
+            for (Integer serverId : new ArrayList<>(channelStates.keySet())) {
+                TabletServerChannelState state = channelStates.remove(serverId);
+                teardownChannelState(serverId, state);
+            }
+        }
+    }
+
+    private void teardownChannelState(int serverId, @Nullable TabletServerChannelState state) {
+        if (state == null) {
+            return;
+        }
+        try {
+            try {
+                state.getSendThread().shutdown();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                LOG.warn(
+                        "Interrupted while shutting down sender thread for tabletServer {}",
+                        serverId,
+                        e);
+            }
+            state.getQueue().clear();
+            state.getMetricGroup().close();
+        } catch (Throwable t) {
+            LOG.error("Error tearing down channel state for tabletServer {}", serverId, t);
+        }
+    }
+
     /** Send NotifyLeaderAndIsr request to the server and handle the response. */
     public void sendBucketLeaderAndIsrRequest(
             int receiveServerId,
@@ -109,16 +233,46 @@ public void sendBucketLeaderAndIsrRequest(
                 responseConsumer);
     }
 
-    /** Send StopBucketReplicaRequest to the server and handle the response. */
+    /**
+     * Enqueues a StopReplica request onto the per-tablet-server sender queue. The sender thread
+     * retries on network-level failures until the TS responds or is removed.
+     */
     public void sendStopBucketReplicaRequest(
             int receiveServerId,
             StopReplicaRequest stopReplicaRequest,
+            int coordinatorEpoch,
+            @Nullable Set<TableBucketReplica> deletionReplicas,
             BiConsumer<StopReplicaResponse, ? super Throwable> responseConsumer) {
-        sendRequest(
-                receiveServerId,
-                stopReplicaRequest,
-                TabletServerGateway::stopReplica,
-                responseConsumer);
+        TabletServerChannelState state;
+        synchronized (channelLock) {
+            state = channelStates.get(receiveServerId);
+        }
+        if (state == null) {
+            LOG.warn(
+                    "No channel state for tabletServer {}; dropping stopReplica (epoch={}). "
+                            + "Correctness for any deletion replicas is handled by "
+                            + "processDeadTabletServer.",
+                    receiveServerId,
+                    coordinatorEpoch);
+            return;
+        }
+        QueueItem item =
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        stopReplicaRequest,
+                        responseConsumer,
+                        coordinatorEpoch,
+                        System.currentTimeMillis(),
+                        deletionReplicas);
+        try {
+            state.getQueue().put(item);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            LOG.warn(
+                    "Interrupted while enqueueing stopReplica for tabletServer {}",
+                    receiveServerId,
+                    e);
+        }
     }
 
     /** Send UpdateMetadataRequest to the server and handle the response. */
@@ -191,6 +345,13 @@ protected Optional<TabletServerGateway> getTabletServerGateway(int targetServerI
         return rpcGatewayManager.getRpcGateway(targetServerId);
     }
 
+    @VisibleForTesting
+    Map<Integer, TabletServerChannelState> getChannelStates() {
+        synchronized (channelLock) {
+            return new HashMap<>(channelStates);
+        }
+    }
+
     /** A functional interface to send request via TabletServerGateway. */
     @VisibleForTesting
     @FunctionalInterface
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
index 59de15087b..fe8d254ab1 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java
@@ -30,7 +30,6 @@
 import org.apache.fluss.server.metadata.ServerInfo;
 import org.apache.fluss.server.zk.ZkEpoch;
 import org.apache.fluss.server.zk.data.LeaderAndIsr;
-import org.apache.fluss.utils.types.Tuple2;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -39,7 +38,6 @@
 import javax.annotation.concurrent.NotThreadSafe;
 
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -58,17 +56,8 @@ public class CoordinatorContext {
     public static final int INITIAL_COORDINATOR_EPOCH = 0;
     public static final int INITIAL_COORDINATOR_EPOCH_ZK_VERSION = 0;
 
-    // for simplicity, we just use retry time, may consider make it a configurable value
-    // and use combine retry times and retry delay
-    public static final int DELETE_TRY_TIMES = 5;
-
     private int offlineBucketCount = 0;
 
-    // a map from the tablet replica to the delete fail number,
-    // once the delete fail number is greater than DELETE_TRY_TIMES, we consider it as
-    // a success deletion.
-    private final Map<TableBucketReplica, Integer> failDeleteNumbers = new HashMap<>();
-
     private final Set<String> liveCoordinatorServers = new HashSet<>();
     private final Map<Integer, ServerInfo> liveTabletServers = new HashMap<>();
     private final Set<Integer> shuttingDownTabletServers = new HashSet<>();
@@ -489,46 +478,6 @@ public Set<TableBucketReplica> getAllReplicasForPartition(long tableId, long par
         return allReplicas;
     }
 
-    /**
-     * Pick up the replicas that should retry delete and replicas that considered as success delete.
-     *
-     * @return A tuple of retry delete replicas and success delete replicas
-     */
-    public Tuple2<Set<TableBucketReplica>, Set<TableBucketReplica>>
-            retryDeleteAndSuccessDeleteReplicas(Collection<TableBucketReplica> failDeleteReplicas) {
-        Set<TableBucketReplica> retryDeleteReplicas = new HashSet<>();
-        Set<TableBucketReplica> successDeleteReplicas = new HashSet<>();
-        for (TableBucketReplica tableBucketReplica : failDeleteReplicas) {
-            if (failDeleteNumbers.getOrDefault(tableBucketReplica, 0) >= DELETE_TRY_TIMES) {
-                // if the current fail number is greater or equal than the threshold, we will
-                // consider it as success delete
-                LOG.warn(
-                        "Delete replica {} failed, retry times is equal to the max retry times {},"
-                                + " just mark it as a successful replica deletion directly.",
-                        tableBucketReplica,
-                        DELETE_TRY_TIMES);
-                failDeleteNumbers.remove(tableBucketReplica);
-                successDeleteReplicas.add(tableBucketReplica);
-            } else {
-                // increment the fail number
-                failDeleteNumbers.merge(tableBucketReplica, 1, Integer::sum);
-                LOG.warn(
-                        "Delete replica {} failed, retry times = {}.",
-                        tableBucketReplica,
-                        failDeleteNumbers.get(tableBucketReplica));
-                retryDeleteReplicas.add(tableBucketReplica);
-            }
-        }
-        return Tuple2.of(retryDeleteReplicas, successDeleteReplicas);
-    }
-
-    /** Clear fail delete number for the given replicas. */
-    public void clearFailDeleteNumbers(Collection<TableBucketReplica> replicas) {
-        for (TableBucketReplica tableBucketReplica : replicas) {
-            failDeleteNumbers.remove(tableBucketReplica);
-        }
-    }
-
     @VisibleForTesting
     protected int replicaCounts(long tableId) {
         return getTableAssignment(tableId).values().stream().mapToInt(List::size).sum();
@@ -684,11 +633,11 @@ public void markPartitionIneligibleForDeletion(TablePartition tablePartition, St
     }
 
     /** Remove the ineligible mark, allowing deletion to be retried. */
-    public void markTableEligibleForDeletion(long tableId) {
+    public void removeTableFromIneligibleForDeletion(long tableId) {
         tablesIneligibleForDeletion.remove(tableId);
     }
 
-    public void markPartitionEligibleForDeletion(TablePartition tablePartition) {
+    public void removePartitionFromIneligibleForDeletion(TablePartition tablePartition) {
         partitionsIneligibleForDeletion.remove(tablePartition);
     }
 
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
index bfa323016e..c2f0642634 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java
@@ -87,7 +87,6 @@
 import org.apache.fluss.server.coordinator.event.RebalanceEvent;
 import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent;
 import org.apache.fluss.server.coordinator.event.SchemaChangeEvent;
-import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.coordinator.event.TableRegistrationChangeEvent;
 import org.apache.fluss.server.coordinator.event.watcher.CoordinatorChangeWatcher;
 import org.apache.fluss.server.coordinator.event.watcher.TableChangeWatcher;
@@ -308,10 +307,10 @@ public void startup() {
     }
 
     public void shutdown() {
-        // close the event manager
         coordinatorEventManager.close();
         rebalanceManager.close();
         onShutdown();
+        coordinatorChannelManager.shutdown();
         coordinatorContext.resetContext();
     }
 
@@ -621,8 +620,6 @@ public void process(CoordinatorEvent event) {
                     (NotifyLeaderAndIsrResponseReceivedEvent) event);
         } else if (event instanceof DeleteReplicaResponseReceivedEvent) {
             processDeleteReplicaResponseReceived((DeleteReplicaResponseReceivedEvent) event);
-        } else if (event instanceof StopReplicaSendFailedEvent) {
-            processStopReplicaSendFailed((StopReplicaSendFailedEvent) event);
         } else if (event instanceof NewCoordinatorEvent) {
             processNewCoordinator((NewCoordinatorEvent) event);
         } else if (event instanceof DeadCoordinatorEvent) {
@@ -962,67 +959,83 @@ private void processDropPartition(DropPartitionEvent dropPartitionEvent) {
                 dropTableInfo.getTablePath(), tableId, tablePartition.getPartitionId());
     }
 
+    /**
+     * Handles StopReplica deletion responses from the sender thread. Partitions per-bucket results
+     * into succeeded/failed sets, then moves replicas to the appropriate state.
+     *
+     * <p>Mirrors Kafka's {@code processTopicDeletionStopReplicaResponseReceived}
+     * (KafkaController.scala:1451-1471). Fluss's StopReplicaResponse has no top-level error code
+     * (FlussApi.proto:373-375), so the Kafka "requestError != NONE → all in error" branch does not
+     * apply. Each bucket's error code is evaluated individually.
+     *
+     * <p>No {@code isActive} guard is needed (unlike Kafka :1454): {@code
+     * coordinatorEventManager.close()} in {@link #shutdown()} joins the event thread before {@code
+     * coordinatorChannelManager.shutdown()}, so no event runs after resignation.
+     */
     private void processDeleteReplicaResponseReceived(
             DeleteReplicaResponseReceivedEvent deleteReplicaResponseReceivedEvent) {
         List<DeleteReplicaResultForBucket> deleteReplicaResultForBuckets =
                 deleteReplicaResponseReceivedEvent.getDeleteReplicaResults();
 
-        Set<TableBucketReplica> failDeletedReplicas = new HashSet<>();
-        Set<TableBucketReplica> successDeletedReplicas = new HashSet<>();
-        for (DeleteReplicaResultForBucket deleteReplicaResultForBucket :
-                deleteReplicaResultForBuckets) {
-            TableBucketReplica tableBucketReplica =
-                    deleteReplicaResultForBucket.getTableBucketReplica();
-            if (deleteReplicaResultForBucket.succeeded()) {
-                successDeletedReplicas.add(tableBucketReplica);
+        Set<TableBucketReplica> succeeded = new HashSet<>();
+        Set<TableBucketReplica> failed = new HashSet<>();
+        for (DeleteReplicaResultForBucket result : deleteReplicaResultForBuckets) {
+            if (result.succeeded()) {
+                succeeded.add(result.getTableBucketReplica());
             } else {
-                failDeletedReplicas.add(tableBucketReplica);
+                failed.add(result.getTableBucketReplica());
             }
         }
-        // clear the fail deleted number for the success deleted replicas
-        coordinatorContext.clearFailDeleteNumbers(successDeletedReplicas);
-
-        // Response-level failures (TS replied with per-bucket error): retry up to
-        // DELETE_TRY_TIMES then force-succeed. RPC-layer failures (no response) are
-        // handled separately via StopReplicaSendFailedEvent.
-        Tuple2<Set<TableBucketReplica>, Set<TableBucketReplica>>
-                retryDeleteAndSuccessDeleteReplicas =
-                        coordinatorContext.retryDeleteAndSuccessDeleteReplicas(failDeletedReplicas);
 
-        // transmit to deletion started for retry delete replicas
-        replicaStateMachine.handleStateChanges(
-                retryDeleteAndSuccessDeleteReplicas.f0, ReplicaDeletionStarted);
-
-        // add all the replicas that considered as success delete to success deleted replicas
-        successDeletedReplicas.addAll(retryDeleteAndSuccessDeleteReplicas.f1);
-        // transmit to deletion successful for success deleted replicas
-        replicaStateMachine.handleStateChanges(successDeletedReplicas, ReplicaDeletionSuccessful);
+        if (!failed.isEmpty()) {
+            failReplicaDeletion(failed);
+        }
+        if (!succeeded.isEmpty()) {
+            completeReplicaDeletion(succeeded);
+        }
+    }
 
-        // if any success deletion, we can resume
-        if (!successDeletedReplicas.isEmpty()) {
-            tableManager.resumeDeletions();
+    /**
+     * Mirrors Kafka's {@code TopicDeletionManager.failReplicaDeletion}
+     * (TopicDeletionManager.scala:150-161): filter to replicas whose owning table/partition is
+     * still queued for deletion, then move them to {@code ReplicaDeletionIneligible} and mark the
+     * owning table/partition ineligible. Recovery happens when the owning TabletServer reconnects
+     * ({@code processNewTabletServer} clears the ineligible flag).
+     */
+    private void failReplicaDeletion(Set<TableBucketReplica> replicas) {
+        Set<TableBucketReplica> stillQueued =
+                replicas.stream()
+                        .filter(this::isStillQueuedForDeletion)
+                        .collect(Collectors.toSet());
+        if (stillQueued.isEmpty()) {
+            return;
         }
+        replicaStateMachine.handleStateChanges(stillQueued, ReplicaDeletionIneligible);
+        markFailedReplicasIneligible(stillQueued, "stopReplica response error");
+        tableManager.resumeDeletions();
     }
 
-    private void processStopReplicaSendFailed(StopReplicaSendFailedEvent event) {
-        // Transition replicas to ReplicaDeletionIneligible and mark the table/partition
-        // ineligible until the TabletServer reconnects. Only process replicas still in
-        // ReplicaDeletionStarted (parallel events may have already moved them).
-        Set<TableBucketReplica> stillInDeletion =
-                event.getFailedReplicas().stream()
-                        .filter(
-                                replica ->
-                                        coordinatorContext.getReplicaState(replica)
-                                                == ReplicaDeletionStarted)
+    private void completeReplicaDeletion(Set<TableBucketReplica> replicas) {
+        Set<TableBucketReplica> stillQueued =
+                replicas.stream()
+                        .filter(this::isStillQueuedForDeletion)
                         .collect(Collectors.toSet());
-        if (stillInDeletion.isEmpty()) {
+        if (stillQueued.isEmpty()) {
             return;
         }
-        replicaStateMachine.handleStateChanges(stillInDeletion, ReplicaDeletionIneligible);
-        markFailedReplicasIneligible(stillInDeletion, "stopReplica RPC send failed");
+        replicaStateMachine.handleStateChanges(stillQueued, ReplicaDeletionSuccessful);
         tableManager.resumeDeletions();
     }
 
+    private boolean isStillQueuedForDeletion(TableBucketReplica replica) {
+        TableBucket tb = replica.getTableBucket();
+        if (tb.getPartitionId() != null) {
+            return coordinatorContext.isPartitionQueuedForDeletion(
+                    new TablePartition(tb.getTableId(), tb.getPartitionId()));
+        }
+        return coordinatorContext.isTableQueuedForDeletion(tb.getTableId());
+    }
+
     private void markFailedReplicasIneligible(
             Set<TableBucketReplica> failedReplicas, String reason) {
         for (TableBucketReplica replica : failedReplicas) {
@@ -1214,8 +1227,9 @@ private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
                         .filter(coordinatorContext::isPartitionQueuedForDeletion)
                         .collect(Collectors.toSet());
         if (!tablesToResume.isEmpty() || !partitionsToResume.isEmpty()) {
-            tablesToResume.forEach(coordinatorContext::markTableEligibleForDeletion);
-            partitionsToResume.forEach(coordinatorContext::markPartitionEligibleForDeletion);
+            tablesToResume.forEach(coordinatorContext::removeTableFromIneligibleForDeletion);
+            partitionsToResume.forEach(
+                    coordinatorContext::removePartitionFromIneligibleForDeletion);
             tableManager.resumeDeletions();
         }
     }
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
index a7e0c36033..ac16ccfb4e 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatch.java
@@ -38,7 +38,6 @@
 import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
 import org.apache.fluss.server.coordinator.event.EventManager;
 import org.apache.fluss.server.coordinator.event.NotifyLeaderAndIsrResponseReceivedEvent;
-import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.entity.DeleteReplicaResultForBucket;
 import org.apache.fluss.server.entity.NotifyLeaderAndIsrData;
 import org.apache.fluss.server.metadata.BucketMetadata;
@@ -441,111 +440,48 @@ private void sendNotifyLeaderAndIsrRequest(int coordinatorEpoch) {
     }
 
     private void sendStopRequest(int coordinatorEpoch) {
-        Set<Integer> liveServers = coordinatorContext.liveTabletServerSet();
-        for (Map.Entry<Integer, Map<TableBucket, PbStopReplicaReqForBucket>> stopReplciaEntry :
+        for (Map.Entry<Integer, Map<TableBucket, PbStopReplicaReqForBucket>> stopReplicaEntry :
                 stopReplicaRequestMap.entrySet()) {
-            // send request for each tablet server
-            Integer serverId = stopReplciaEntry.getKey();
+            Integer serverId = stopReplicaEntry.getKey();
 
-            // construct the stop replica request
-            Map<TableBucket, PbStopReplicaReqForBucket> stopReplicas = stopReplciaEntry.getValue();
+            Map<TableBucket, PbStopReplicaReqForBucket> stopReplicas = stopReplicaEntry.getValue();
             StopReplicaRequest stopReplicaRequest = new StopReplicaRequest();
             stopReplicaRequest
                     .setCoordinatorEpoch(coordinatorEpoch)
                     .addAllStopReplicasReqs(stopReplicas.values());
 
-            // we collect the buckets whose replica is to be deleted
             Set<TableBucket> deletedReplicaBuckets =
                     stopReplicas.values().stream()
                             .filter(pbBucket -> pbBucket.isDelete() && pbBucket.isDeleteRemote())
                             .map(t -> toTableBucket(t.getTableBucket()))
                             .collect(Collectors.toSet());
 
-            // If the TS is not live, the channel manager has no gateway and would silently
-            // drop the request. Surface the failure so deletion replicas can be marked
-            // ineligible instead of stuck in ReplicaDeletionStarted.
-            if (!liveServers.contains(serverId)) {
-                if (!deletedReplicaBuckets.isEmpty()) {
-                    LOG.warn(
-                            "Tablet server {} is not in the live set when sending stop replica "
-                                    + "request; surfacing as StopReplicaSendFailedEvent for {} "
-                                    + "deletion replica(s).",
-                            serverId,
-                            deletedReplicaBuckets.size());
-                    Set<TableBucketReplica> failedReplicas =
-                            deletedReplicaBuckets.stream()
-                                    .map(bucket -> new TableBucketReplica(bucket, serverId))
-                                    .collect(Collectors.toSet());
-                    eventManager.put(new StopReplicaSendFailedEvent(failedReplicas));
-                }
-                continue;
-            }
+            Set<TableBucketReplica> deletionReplicas =
+                    deletedReplicaBuckets.stream()
+                            .map(bucket -> new TableBucketReplica(bucket, serverId))
+                            .collect(Collectors.toSet());
 
             coordinatorChannelManager.sendStopBucketReplicaRequest(
                     serverId,
                     stopReplicaRequest,
+                    coordinatorEpoch,
+                    deletionReplicas.isEmpty() ? null : deletionReplicas,
                     (response, throwable) -> {
-                        if (throwable != null) {
-                            LOG.warn(
-                                    "Failed to send stop replica request to tablet server {}.",
-                                    serverId,
-                                    throwable);
-
-                            // For deletion replicas, surface as StopReplicaSendFailedEvent so
-                            // they can be marked ineligible. Non-delete stop replica failures
-                            // are best-effort and can be ignored.
-                            if (!deletedReplicaBuckets.isEmpty()) {
-                                Set<TableBucketReplica> failedReplicas =
-                                        deletedReplicaBuckets.stream()
-                                                .map(
-                                                        bucket ->
-                                                                new TableBucketReplica(
-                                                                        bucket, serverId))
-                                                .collect(Collectors.toSet());
-                                eventManager.put(new StopReplicaSendFailedEvent(failedReplicas));
-                            }
-                            return;
-                        }
-                        // handle the response
                         List<DeleteReplicaResultForBucket> deleteReplicaResultForBuckets =
                                 new ArrayList<>();
                         List<PbStopReplicaRespForBucket> stopReplicasResps =
                                 response.getStopReplicasRespsList();
-                        // construct the result for stop replica
-                        // for each replica
                         for (PbStopReplicaRespForBucket stopReplicaRespForBucket :
                                 stopReplicasResps) {
                             TableBucket tableBucket =
                                     toTableBucket(stopReplicaRespForBucket.getTableBucket());
 
-                            // now, for stop replica(delete=false), it's best effort without any
-                            // error handling.
-                            // currently, it only happens in the two case:
-                            // 1. send stop replica(delete = false) for table deletion, if it fails,
-                            // the following step will trigger replica to ReplicaDeletionStarted
-                            // will send stop replica(delete =true) which will retry if fail.
-                            // 2. send notify leader and isr request to tablet server, but the
-                            // tablet server fail to init a replica
-                            // then, it'll send stop replica(delete = false) to the tablet server to
-                            // make the tablet server can stop the replica; It's still fine if
-                            // sending stop replica fail.
-                            // todo: let's revisit here to see whether we can
-                            // really ignore the error after
-                            // we finish the logic of tablet server.
-
-                            // but for stop replica(delete=true), we need to handle the error and
-                            // retry deletion.
-
-                            // filter out the  error response for replica deletion.
                             if (deletedReplicaBuckets.contains(tableBucket)) {
                                 DeleteReplicaResultForBucket deleteReplicaResultForBucket;
-                                TableBucketReplica tableBucketReplica =
-                                        new TableBucketReplica(tableBucket, serverId);
-                                // if fail;
                                 if (stopReplicaRespForBucket.hasErrorCode()) {
                                     deleteReplicaResultForBucket =
                                             new DeleteReplicaResultForBucket(
-                                                    tableBucketReplica.getTableBucket(),
+                                                    tableBucket,
                                                     serverId,
                                                     ApiError.fromErrorMessage(
                                                             stopReplicaRespForBucket));
@@ -556,13 +492,10 @@ private void sendStopRequest(int coordinatorEpoch) {
                                 deleteReplicaResultForBuckets.add(deleteReplicaResultForBucket);
                             }
                         }
-                        // if there are any deleted replicas, construct
-                        // the DeleteReplicaResponseReceivedEvent and put into event manager
                         if (!deleteReplicaResultForBuckets.isEmpty()) {
-                            DeleteReplicaResponseReceivedEvent deleteReplicaResponseReceivedEvent =
+                            eventManager.put(
                                     new DeleteReplicaResponseReceivedEvent(
-                                            deleteReplicaResultForBuckets);
-                            eventManager.put(deleteReplicaResponseReceivedEvent);
+                                            deleteReplicaResultForBuckets));
                         }
                     });
         }
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java
index 6914e8207e..4148374ffb 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorServer.java
@@ -312,18 +312,24 @@ protected void initCoordinatorLeader() throws Exception {
             this.clientMetricGroup = new ClientMetricGroup(metricRegistry, SERVER_NAME);
             this.rpcClient = RpcClient.create(conf, clientMetricGroup);
 
-            this.coordinatorChannelManager = new CoordinatorChannelManager(rpcClient);
+            // start coordinator event processor after we register coordinator leader to zk
+            // so that the event processor can get the coordinator leader node from zk during
+            // start up. In HA for coordinator server, the processor also need to know the leader
+            // node during start up
+            CoordinatorContext coordinatorContext = new CoordinatorContext(zkEpoch);
+
+            this.coordinatorChannelManager =
+                    new CoordinatorChannelManager(
+                            rpcClient,
+                            coordinatorContext::getCoordinatorEpoch,
+                            conf,
+                            serverMetricGroup);
 
             this.autoPartitionManager =
                     new AutoPartitionManager(
                             metadataCache, metadataManager, remoteDirDynamicLoader, conf);
             autoPartitionManager.start();
 
-            // start coordinator event processor after we register coordinator leader to zk
-            // so that the event processor can get the coordinator leader node from zk during
-            // start up. In HA for coordinator server, the processor also need to know the leader
-            // node during start up
-            CoordinatorContext coordinatorContext = new CoordinatorContext(zkEpoch);
             this.coordinatorEventProcessor =
                     new CoordinatorEventProcessor(
                             zkClient,
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ApiKey.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ApiKey.java
new file mode 100644
index 0000000000..620ea491a8
--- /dev/null
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ApiKey.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.channel;
+
+/**
+ * Enumeration of control-plane RPCs routed through the per-tablet-server sender thread.
+ *
+ * <p>Only {@link #STOP_REPLICA} is supported in this iteration. Future iterations may add other
+ * coordinator → tablet-server RPCs (e.g., {@code NOTIFY_LEADER_AND_ISR}) once they are migrated to
+ * the same sender-thread retry layer.
+ */
+public enum ApiKey {
+    /** {@code stopReplica(delete=true|false)} request. */
+    STOP_REPLICA
+}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThread.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThread.java
new file mode 100644
index 0000000000..9209e0c52f
--- /dev/null
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThread.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.channel;
+
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metrics.Counter;
+import org.apache.fluss.metrics.DescriptiveStatisticsHistogram;
+import org.apache.fluss.metrics.Histogram;
+import org.apache.fluss.metrics.MetricNames;
+import org.apache.fluss.metrics.groups.MetricGroup;
+import org.apache.fluss.rpc.gateway.TabletServerGateway;
+import org.apache.fluss.rpc.messages.ApiMessage;
+import org.apache.fluss.rpc.messages.StopReplicaRequest;
+import org.apache.fluss.utils.concurrent.ShutdownableThread;
+
+import java.util.Optional;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.IntSupplier;
+import java.util.function.Supplier;
+
+/**
+ * Per-tablet-server sender thread that drains the control-plane request queue and retries on
+ * transient RPC failures. Mirrors Kafka's {@code RequestSendThread}
+ * (ControllerChannelManager.scala:226-300).
+ *
+ * <p>Each invocation of {@link #doWork()} takes one {@link QueueItem} from the queue. Stale items
+ * (whose {@code coordinatorEpoch} is less than the current epoch) are dropped immediately.
+ * Otherwise the item is sent to the tablet server via the gateway, retrying with a configurable
+ * backoff until the send succeeds or the thread is shut down.
+ *
+ * <p>The callback is invoked in its own {@code try/catch} OUTSIDE the retry loop (alignment fix F),
+ * so a buggy response handler cannot re-enter the send retry path and cannot reach the outer
+ * Scenario-E catch.
+ */
+public class ControlRequestSendThread extends ShutdownableThread {
+
+    private static final int HISTOGRAM_WINDOW_SIZE = 100;
+
+    private final int tabletServerId;
+    private final BlockingQueue<QueueItem> queue;
+    private final Supplier<Optional<TabletServerGateway>> gatewaySupplier;
+    private final IntSupplier epochSupplier;
+    private final long backoffMs;
+    private final long requestTimeoutMs;
+
+    private final Histogram queueTimeMsHistogram;
+    private final Counter retryCount;
+    private final Counter staleDropCount;
+    private final AtomicInteger aliveFlag;
+
+    private final AtomicReference<CompletableFuture<? extends ApiMessage>> inFlight =
+            new AtomicReference<>();
+
+    public ControlRequestSendThread(
+            int tabletServerId,
+            BlockingQueue<QueueItem> queue,
+            Supplier<Optional<TabletServerGateway>> gatewaySupplier,
+            IntSupplier epochSupplier,
+            Configuration conf,
+            MetricGroup metricGroup) {
+        super("coordinator-control-request-sender-" + tabletServerId, true);
+        this.tabletServerId = tabletServerId;
+        this.queue = queue;
+        this.gatewaySupplier = gatewaySupplier;
+        this.epochSupplier = epochSupplier;
+        this.backoffMs = conf.get(ConfigOptions.COORDINATOR_REQUEST_RETRY_BACKOFF).toMillis();
+        this.requestTimeoutMs = conf.get(ConfigOptions.COORDINATOR_REQUEST_TIMEOUT).toMillis();
+
+        this.queueTimeMsHistogram =
+                metricGroup.histogram(
+                        MetricNames.SENDER_QUEUE_TIME_MS,
+                        new DescriptiveStatisticsHistogram(HISTOGRAM_WINDOW_SIZE));
+        this.retryCount = metricGroup.counter(MetricNames.SENDER_RETRY_COUNT);
+        this.staleDropCount = metricGroup.counter(MetricNames.SENDER_STALE_DROP_COUNT);
+        this.aliveFlag = new AtomicInteger(0);
+        metricGroup.gauge(MetricNames.SENDER_ALIVE, aliveFlag::get);
+    }
+
+    @Override
+    public void run() {
+        aliveFlag.set(1);
+        super.run();
+    }
+
+    @Override
+    public boolean initiateShutdown() {
+        boolean initiated = super.initiateShutdown();
+        if (initiated) {
+            CompletableFuture<? extends ApiMessage> f = inFlight.getAndSet(null);
+            if (f != null) {
+                f.cancel(true);
+            }
+        }
+        return initiated;
+    }
+
+    @Override
+    public void shutdown() throws InterruptedException {
+        try {
+            super.shutdown();
+        } finally {
+            aliveFlag.set(0);
+        }
+    }
+
+    @Override
+    public void doWork() throws Exception {
+        try {
+            QueueItem item = queue.take();
+            queueTimeMsHistogram.update(System.currentTimeMillis() - item.getEnqueueTimeMs());
+
+            int currentEpoch = epochSupplier.getAsInt();
+            if (item.getCoordinatorEpoch() < currentEpoch) {
+                staleDropCount.inc();
+                log.warn(
+                        "Dropping stale {} for tabletServer {}: itemEpoch={} < currentEpoch={}",
+                        item.getApiKey(),
+                        tabletServerId,
+                        item.getCoordinatorEpoch(),
+                        currentEpoch);
+                return;
+            }
+
+            ApiMessage response = null;
+            boolean sendSuccessful = false;
+            while (isRunning() && !sendSuccessful) {
+                Optional<TabletServerGateway> gatewayOpt = gatewaySupplier.get();
+                if (!gatewayOpt.isPresent()) {
+                    retryCount.inc();
+                    backoff();
+                    continue;
+                }
+                try {
+                    CompletableFuture<? extends ApiMessage> future = invoke(gatewayOpt.get(), item);
+                    inFlight.set(future);
+                    response = future.get(requestTimeoutMs, TimeUnit.MILLISECONDS);
+                    sendSuccessful = true;
+                } catch (Throwable t) {
+                    retryCount.inc();
+                    log.warn(
+                            "Failed to send {} to tabletServer {}; will retry after {}ms",
+                            item.getApiKey(),
+                            tabletServerId,
+                            backoffMs,
+                            t);
+                    backoff();
+                } finally {
+                    inFlight.set(null);
+                }
+            }
+
+            if (sendSuccessful && item.getCallback() != null) {
+                try {
+                    item.getCallback().accept(response, null);
+                } catch (Throwable t) {
+                    log.error(
+                            "Callback for {} to tabletServer {} threw; the request itself "
+                                    + "succeeded so no retry is attempted.",
+                            item.getApiKey(),
+                            tabletServerId,
+                            t);
+                }
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        } catch (Throwable t) {
+            log.error("Unexpected error in {}; thread continues", getName(), t);
+        }
+    }
+
+    private void backoff() throws InterruptedException {
+        pause(backoffMs, TimeUnit.MILLISECONDS);
+    }
+
+    private CompletableFuture<? extends ApiMessage> invoke(
+            TabletServerGateway gateway, QueueItem item) {
+        switch (item.getApiKey()) {
+            case STOP_REPLICA:
+                return gateway.stopReplica((StopReplicaRequest) item.getRequest());
+            default:
+                throw new IllegalArgumentException("Unsupported ApiKey: " + item.getApiKey());
+        }
+    }
+}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/QueueItem.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/QueueItem.java
new file mode 100644
index 0000000000..e0dd936444
--- /dev/null
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/QueueItem.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.channel;
+
+import org.apache.fluss.metadata.TableBucketReplica;
+import org.apache.fluss.rpc.messages.ApiMessage;
+
+import javax.annotation.Nullable;
+
+import java.util.Set;
+import java.util.function.BiConsumer;
+
+/**
+ * An immutable item placed on the per-tablet-server sender queue.
+ *
+ * <p>Mirrors Kafka's {@code QueueItem} (ControllerChannelManager.scala:218-219) with two
+ * Fluss-specific additions: {@code coordinatorEpoch} (for the §5.4.1 stale-drop check that Kafka
+ * does not need because controller resignation tears down the channel manager before the new epoch
+ * takes over) and {@code deletionReplicas} (so the response-handler path can reconstruct the
+ * replica set without re-deriving it from the response).
+ */
+public final class QueueItem {
+
+    private final ApiKey apiKey;
+    private final ApiMessage request;
+    @Nullable private final BiConsumer<ApiMessage, Throwable> callback;
+    private final int coordinatorEpoch;
+    private final long enqueueTimeMs;
+    @Nullable private final Set<TableBucketReplica> deletionReplicas;
+
+    @SuppressWarnings("unchecked")
+    public QueueItem(
+            ApiKey apiKey,
+            ApiMessage request,
+            @Nullable BiConsumer<? extends ApiMessage, ? super Throwable> callback,
+            int coordinatorEpoch,
+            long enqueueTimeMs,
+            @Nullable Set<TableBucketReplica> deletionReplicas) {
+        this.apiKey = apiKey;
+        this.request = request;
+        this.callback = (BiConsumer<ApiMessage, Throwable>) callback;
+        this.coordinatorEpoch = coordinatorEpoch;
+        this.enqueueTimeMs = enqueueTimeMs;
+        this.deletionReplicas = deletionReplicas;
+    }
+
+    public ApiKey getApiKey() {
+        return apiKey;
+    }
+
+    public ApiMessage getRequest() {
+        return request;
+    }
+
+    @Nullable
+    public BiConsumer<ApiMessage, Throwable> getCallback() {
+        return callback;
+    }
+
+    public int getCoordinatorEpoch() {
+        return coordinatorEpoch;
+    }
+
+    public long getEnqueueTimeMs() {
+        return enqueueTimeMs;
+    }
+
+    @Nullable
+    public Set<TableBucketReplica> getDeletionReplicas() {
+        return deletionReplicas;
+    }
+}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/TabletServerChannelState.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/TabletServerChannelState.java
new file mode 100644
index 0000000000..3585abb23a
--- /dev/null
+++ b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/channel/TabletServerChannelState.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.channel;
+
+import org.apache.fluss.cluster.ServerNode;
+import org.apache.fluss.metrics.groups.MetricGroup;
+
+import javax.annotation.concurrent.ThreadSafe;
+
+import java.util.concurrent.BlockingQueue;
+
+/**
+ * Per-tablet-server channel state held by {@link
+ * org.apache.fluss.server.coordinator.CoordinatorChannelManager}: the queue, the sender thread, and
+ * the metric handles needed to deregister the per-TS metrics on remove.
+ *
+ * <p>Mirrors Kafka's {@code ControllerBrokerStateInfo} (ControllerChannelManager.scala:768-774).
+ * The {@code networkClient} field is intentionally absent — {@code RpcGatewayManager} owns the
+ * gateway exactly the way Kafka's controller owns the {@code NetworkClient}, so we do not duplicate
+ * it on the state. Three Fluss-specific additions surface behavior that has no Kafka analog:
+ *
+ * <ul>
+ *   <li>{@code retryCount} — counts §5.4 retry attempts.
+ *   <li>{@code staleDropCount} — counts §5.4.1 stale-epoch drops at dequeue.
+ *   <li>{@code aliveGauge} — presence indicator for the per-TS sender (§8.5 Scenario E).
+ * </ul>
+ *
+ * <p>Each field is final and assigned once; mutable concurrent state is encapsulated in the queue,
+ * the thread, and the metric handles themselves.
+ */
+@ThreadSafe
+public final class TabletServerChannelState {
+
+    private final int tabletServerId;
+    private final ServerNode serverNode;
+    private final BlockingQueue<QueueItem> queue;
+    private final ControlRequestSendThread sendThread;
+
+    /**
+     * The per-TS child {@link MetricGroup}. Held so {@code removeTabletServer} can {@code close()}
+     * it and deregister all five per-TS metrics in one shot — mirrors Kafka's two {@code
+     * metricsGroup.removeMetric(...)} calls at ControllerChannelManager.scala:203-204.
+     */
+    private final MetricGroup metricGroup;
+
+    public TabletServerChannelState(
+            int tabletServerId,
+            ServerNode serverNode,
+            BlockingQueue<QueueItem> queue,
+            ControlRequestSendThread sendThread,
+            MetricGroup metricGroup) {
+        this.tabletServerId = tabletServerId;
+        this.serverNode = serverNode;
+        this.queue = queue;
+        this.sendThread = sendThread;
+        this.metricGroup = metricGroup;
+    }
+
+    public int getTabletServerId() {
+        return tabletServerId;
+    }
+
+    public ServerNode getServerNode() {
+        return serverNode;
+    }
+
+    public BlockingQueue<QueueItem> getQueue() {
+        return queue;
+    }
+
+    public ControlRequestSendThread getSendThread() {
+        return sendThread;
+    }
+
+    public MetricGroup getMetricGroup() {
+        return metricGroup;
+    }
+}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java b/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
deleted file mode 100644
index 59e160418b..0000000000
--- a/fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/StopReplicaSendFailedEvent.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.fluss.server.coordinator.event;
-
-import org.apache.fluss.metadata.TableBucketReplica;
-
-import java.util.Collections;
-import java.util.Set;
-
-/**
- * Signals that a {@code stopReplica(delete=true)} request could not be delivered at the RPC layer
- * (e.g., gateway missing or network error). The event processor transitions affected replicas to
- * {@code ReplicaDeletionIneligible} until the tablet server reconnects.
- */
-public class StopReplicaSendFailedEvent implements CoordinatorEvent {
-
-    private final Set<TableBucketReplica> failedReplicas;
-
-    public StopReplicaSendFailedEvent(Set<TableBucketReplica> failedReplicas) {
-        this.failedReplicas = Collections.unmodifiableSet(failedReplicas);
-    }
-
-    public Set<TableBucketReplica> getFailedReplicas() {
-        return failedReplicas;
-    }
-}
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorChannelManagerTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorChannelManagerTest.java
index 31c07d75cd..1f6e974e96 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorChannelManagerTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorChannelManagerTest.java
@@ -22,6 +22,7 @@
 import org.apache.fluss.rpc.RpcClient;
 import org.apache.fluss.rpc.messages.UpdateMetadataRequest;
 import org.apache.fluss.rpc.metrics.TestingClientMetricGroup;
+import org.apache.fluss.server.metrics.group.TestingMetricGroups;
 import org.apache.fluss.server.testutils.FlussClusterExtension;
 import org.apache.fluss.server.zk.ZooKeeperExtension;
 import org.apache.fluss.testutils.common.AllCallbackWrapper;
@@ -54,7 +55,10 @@ void testCoordinatorChannelManager() throws Exception {
         Configuration configuration = new Configuration();
         CoordinatorChannelManager coordinatorChannelManager =
                 new CoordinatorChannelManager(
-                        RpcClient.create(configuration, TestingClientMetricGroup.newInstance()));
+                        RpcClient.create(configuration, TestingClientMetricGroup.newInstance()),
+                        () -> 0,
+                        configuration,
+                        TestingMetricGroups.COORDINATOR_METRICS);
         List<ServerNode> tabletServersNode = FLUSS_CLUSTER_EXTENSION.getTabletServerNodes();
 
         // test start up using server 0
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
index 26c03fe043..558bb9f008 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorContextTest.java
@@ -106,9 +106,9 @@ void testIneligibleForDeletionMarkAndRemove() {
         assertThat(context.isTableIneligibleForDeletion(unqueuedTableId)).isFalse();
         assertThat(context.isPartitionIneligibleForDeletion(unqueuedPartition)).isFalse();
 
-        // (d) markEligibleForDeletion removes the ineligible flag
-        context.markTableEligibleForDeletion(queuedTableId);
-        context.markPartitionEligibleForDeletion(queuedPartition);
+        // (d) removeFromIneligibleForDeletion removes the ineligible flag
+        context.removeTableFromIneligibleForDeletion(queuedTableId);
+        context.removePartitionFromIneligibleForDeletion(queuedPartition);
         assertThat(context.isTableIneligibleForDeletion(queuedTableId)).isFalse();
         assertThat(context.isPartitionIneligibleForDeletion(queuedPartition)).isFalse();
 
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
index bf579bce75..57d3dac01c 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessorTest.java
@@ -55,7 +55,6 @@
 import org.apache.fluss.server.coordinator.event.DropPartitionEvent;
 import org.apache.fluss.server.coordinator.event.DropTableEvent;
 import org.apache.fluss.server.coordinator.event.NewTabletServerEvent;
-import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
 import org.apache.fluss.server.coordinator.lease.KvSnapshotLeaseManager;
 import org.apache.fluss.server.coordinator.remote.RemoteDirDynamicLoader;
 import org.apache.fluss.server.coordinator.statemachine.BucketState;
@@ -316,11 +315,11 @@ void testCreateAndDropTable() throws Exception {
 
     @Test
     void testDropTableWithRetry() throws Exception {
-        // make request to some server should fail, but delete will still be successful
-        // finally with retry logic
+        // Kafka-aligned retry: when stopReplica fails for a server, replicas are marked
+        // ineligible. When the server reconnects (DeadTabletServerEvent followed by
+        // NewTabletServerEvent), the ineligible flag is cleared and deletion retries.
         int failedServer = 0;
         initCoordinatorChannel(failedServer);
-        // create a table,
         TablePath t1 = TablePath.of(defaultDatabase, "tdrop");
         final long t1Id =
                 createTable(
@@ -331,73 +330,36 @@ void testDropTableWithRetry() throws Exception {
                             new TabletServerInfo(2, "rack2")
                         });
 
-        // retry until the create table t1 has been handled by coordinator
-        // otherwise, when receive create table event, it can't find the schema of the table
-        // since it has been deleted by the following code) which cause delete
-        // won't don anything
-        // todo: may need to fix this case;
         retryVerifyContext(ctx -> assertThat(ctx.getTablePathById(t1Id)).isNotNull());
 
-        // drop the table;
+        // Capture server info before the server "dies".
+        ServerInfo capturedServerInfo =
+                fromCtx(ctx -> ctx.getLiveTabletServers().get(failedServer));
+        assertThat(capturedServerInfo).isNotNull();
+
+        // Drop the table. Server 0's stopReplica returns errors; its replicas will be
+        // marked ineligible after the response is processed.
         metadataManager.dropTable(t1, false);
 
-        // retry until the assignment has been deleted from zk, then it means
-        // the table has been deleted successfully
-        retry(
-                Duration.ofMinutes(1),
-                () -> assertThat(zookeeperClient.getTableAssignment(t1Id)).isEmpty());
-    }
+        retryVerifyContext(ctx -> assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue());
 
-    @Test
-    void testStopReplicaSendFailedTransitionsReplicasToOffline() throws Exception {
-        // Verification #3: dispatch StopReplicaSendFailedEvent. Replicas in
-        // ReplicaDeletionStarted should be transitioned to ReplicaDeletionIneligible by
-        // processStopReplicaSendFailed, then resumeDeletions Step 2 retries them back to
-        // OfflineReplica. The owning table is marked ineligible so onDeleteTable does not
-        // fire.
+        // Fix server 0's gateway so future stopReplica calls succeed.
         initCoordinatorChannel();
-        TablePath t1 = TablePath.of(defaultDatabase, "t_stop_replica_send_failed");
-        final long t1Id =
-                createTable(
-                        t1,
-                        new TabletServerInfo[] {
-                            new TabletServerInfo(0, "rack0"),
-                            new TabletServerInfo(1, "rack1"),
-                            new TabletServerInfo(2, "rack2")
-                        });
-        // wait until the table has its full replica set in the context AND all are online
-        retryVerifyContext(
-                ctx -> {
-                    assertThat(ctx.replicaCounts(t1Id)).isEqualTo(N_BUCKETS * REPLICATION_FACTOR);
-                    assertThat(ctx.areAllReplicasInState(t1Id, OnlineReplica)).isTrue();
-                });
 
-        // Setup: queue table for deletion and force all replicas into ReplicaDeletionStarted.
-        Set<TableBucketReplica> replicas =
-                fromCtx(
-                        ctx -> {
-                            ctx.queueTableDeletion(Collections.singleton(t1Id));
-                            Set<TableBucketReplica> all = ctx.getAllReplicasForTable(t1Id);
-                            for (TableBucketReplica r : all) {
-                                ctx.putReplicaState(r, ReplicaState.ReplicaDeletionStarted);
-                            }
-                            return all;
-                        });
-        assertThat(replicas).hasSize(N_BUCKETS * REPLICATION_FACTOR);
+        // Simulate server 0 dying and reconnecting to clear the ineligible flag.
+        eventProcessor.getCoordinatorEventManager().put(new DeadTabletServerEvent(failedServer));
+        retryVerifyContext(
+                ctx -> assertThat(ctx.liveTabletServerSet()).doesNotContain(failedServer));
 
-        // Dispatch the event under test.
-        eventProcessor.getCoordinatorEventManager().put(new StopReplicaSendFailedEvent(replicas));
+        eventProcessor
+                .getCoordinatorEventManager()
+                .put(new NewTabletServerEvent(capturedServerInfo));
 
-        // After processing: table is ineligible and all replicas land in OfflineReplica
-        // (Started -> Ineligible by processStopReplicaSendFailed, then Ineligible ->
-        // OfflineReplica by resumeDeletions Step 2 retry).
-        retryVerifyContext(
-                ctx -> {
-                    assertThat(ctx.isTableIneligibleForDeletion(t1Id)).isTrue();
-                    for (TableBucketReplica r : replicas) {
-                        assertThat(ctx.getReplicaState(r)).isEqualTo(OfflineReplica);
-                    }
-                });
+        // After reconnect the ineligible flag is cleared and deletion retries — this time
+        // succeeding. The table assignment should be fully cleaned up.
+        retry(
+                Duration.ofMinutes(1),
+                () -> assertThat(zookeeperClient.getTableAssignment(t1Id)).isEmpty());
     }
 
     @Test
@@ -824,7 +786,7 @@ void testProcessDeadTabletServerMarksPartitionIneligible() throws Exception {
     void testProcessNewTabletServerResumesPartitionDeletion() throws Exception {
         // Change 6 partition variant (Verification #25): partition queued for deletion
         // with replicas on a previously-offline TS that's now reconnecting.
-        // markPartitionEligibleForDeletion is called and resumeDeletions completes the
+        // removePartitionFromIneligibleForDeletion is called and resumeDeletions completes the
         // partition deletion.
         TablePath tablePath = TablePath.of(defaultDatabase, "t_new_ts_resume_partition");
         initCoordinatorChannel();
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
deleted file mode 100644
index 1d280dbaf2..0000000000
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/CoordinatorRequestBatchTest.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.fluss.server.coordinator;
-
-import org.apache.fluss.metadata.TableBucket;
-import org.apache.fluss.metadata.TableBucketReplica;
-import org.apache.fluss.server.coordinator.event.CoordinatorEvent;
-import org.apache.fluss.server.coordinator.event.StopReplicaSendFailedEvent;
-import org.apache.fluss.server.coordinator.event.TestingEventManager;
-import org.apache.fluss.server.zk.ZkEpoch;
-import org.apache.fluss.server.zk.data.LeaderAndIsr;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.stream.Collectors;
-
-import static org.apache.fluss.server.coordinator.CoordinatorTestUtils.createServers;
-import static org.assertj.core.api.Assertions.assertThat;
-
-/**
- * Test for {@link CoordinatorRequestBatch}, focusing on the pre-check in {@code sendStopRequest}
- * that surfaces a {@link StopReplicaSendFailedEvent} when a target tablet server is not in the live
- * set.
- */
-class CoordinatorRequestBatchTest {
-
-    private static final long TABLE_ID = 1L;
-    private static final int BUCKET = 0;
-    private static final int LIVE_SERVER = 1;
-    private static final int DEAD_SERVER = 2;
-    private static final int LEADER_EPOCH = 0;
-    private static final int COORDINATOR_EPOCH = 0;
-
-    private CoordinatorContext coordinatorContext;
-    private TestCoordinatorChannelManager testCoordinatorChannelManager;
-    private TestingEventManager testingEventManager;
-    private CoordinatorRequestBatch requestBatch;
-    private TableBucket tableBucket;
-
-    @BeforeEach
-    void beforeEach() {
-        coordinatorContext = new CoordinatorContext(ZkEpoch.INITIAL_EPOCH);
-        // servers 0 and 1 are live; server 2 is intentionally NOT in the live set
-        coordinatorContext.setLiveTabletServers(createServers(Arrays.asList(0, LIVE_SERVER)));
-
-        // bucket has replicas on the live server and the dead server
-        tableBucket = new TableBucket(TABLE_ID, BUCKET);
-        coordinatorContext.updateBucketReplicaAssignment(
-                tableBucket, Arrays.asList(LIVE_SERVER, DEAD_SERVER));
-        coordinatorContext.putBucketLeaderAndIsr(
-                tableBucket,
-                new LeaderAndIsr(
-                        LIVE_SERVER,
-                        LEADER_EPOCH,
-                        Arrays.asList(LIVE_SERVER, DEAD_SERVER),
-                        Collections.emptyList(),
-                        COORDINATOR_EPOCH,
-                        0));
-
-        // no gateways are set on the channel manager; the live-set pre-check is what matters here
-        testCoordinatorChannelManager = new TestCoordinatorChannelManager();
-        testingEventManager = new TestingEventManager();
-        requestBatch =
-                new CoordinatorRequestBatch(
-                        testCoordinatorChannelManager, testingEventManager, coordinatorContext);
-    }
-
-    @Test
-    void testSendStopRequestEmitsFailedEventWhenServerNotInLiveSet() {
-        // schedule a delete=true stop-replica targeting BOTH the live and dead server
-        Set<Integer> targetServers = new HashSet<>(Arrays.asList(LIVE_SERVER, DEAD_SERVER));
-        requestBatch.newBatch();
-        requestBatch.addStopReplicaRequestForTabletServers(
-                targetServers, tableBucket, true, true, LEADER_EPOCH);
-
-        // fire sendStopRequest internally
-        requestBatch.sendRequestToTabletServers(COORDINATOR_EPOCH);
-
-        // exactly one StopReplicaSendFailedEvent should be emitted (for the dead server)
-        List<StopReplicaSendFailedEvent> failedEvents =
-                testingEventManager.getEvents().stream()
-                        .filter(e -> e instanceof StopReplicaSendFailedEvent)
-                        .map(e -> (StopReplicaSendFailedEvent) e)
-                        .collect(Collectors.toList());
-        assertThat(failedEvents).hasSize(1);
-
-        Set<TableBucketReplica> failedReplicas = failedEvents.get(0).getFailedReplicas();
-        assertThat(failedReplicas)
-                .containsExactly(new TableBucketReplica(tableBucket, DEAD_SERVER))
-                .doesNotContain(new TableBucketReplica(tableBucket, LIVE_SERVER));
-    }
-
-    @Test
-    void testSendStopRequestSilentlySkipsNonDeleteForDeadServer() {
-        // schedule a non-deletion stop-replica (delete=false, deleteRemote=false) targeting the
-        // dead server; this should be silently skipped without surfacing any event
-        requestBatch.newBatch();
-        requestBatch.addStopReplicaRequestForTabletServers(
-                Collections.singleton(DEAD_SERVER), tableBucket, false, false, LEADER_EPOCH);
-
-        requestBatch.sendRequestToTabletServers(COORDINATOR_EPOCH);
-
-        // no StopReplicaSendFailedEvent should be emitted for non-delete stop replicas
-        List<CoordinatorEvent> failedEvents =
-                testingEventManager.getEvents().stream()
-                        .filter(e -> e instanceof StopReplicaSendFailedEvent)
-                        .collect(Collectors.toList());
-        assertThat(failedEvents).isEmpty();
-    }
-}
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
index 143fc24459..769146f772 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/StopReplicaITCase.java
@@ -37,7 +37,6 @@
 import org.apache.fluss.types.DataTypes;
 
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.RegisterExtension;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -204,34 +203,10 @@ void testDropTableCompletesAfterOfflineTabletServerReconnect() throws Exception
     }
 
     /**
-     * Verification item #11 (Change 4b end-to-end): would verify that when a stopReplica RPC fails
-     * to send while the target tablet server is still in {@code liveTabletServerSet}, the replica
-     * ends up in {@code OfflineReplica} and the table is marked ineligible until the next event
-     * removes the ineligibility.
-     *
-     * <p>Reason for {@code @Disabled}: deterministically simulating only the RPC-layer send failure
-     * (without taking the tablet server offline) requires fault injection into the coordinator's
-     * gateway, which {@link FlussClusterExtension} does not currently expose. Killing the tablet
-     * server to trigger the same code path is functionally equivalent to {@link
-     * #testDropTableCompletesAfterOfflineTabletServerReconnect()} (#10) and would be redundant. The
-     * corresponding behaviour is already covered at the unit level by the {@code
-     * StopReplicaSendFailedEvent} / {@code CoordinatorRequestBatch} tests (verification items 3 and
-     * 16).
-     */
-    @Test
-    @Disabled(
-            "Requires gateway-level fault injection not yet supported by FlussClusterExtension; "
-                    + "covered by unit tests for StopReplicaSendFailedEvent and "
-                    + "CoordinatorRequestBatch.sendStopRequest.")
-    void testDropTableRecoversFromStopReplicaRpcSendFailure() {
-        // Intentionally left blank — see @Disabled rationale.
-    }
-
-    /**
-     * Verification item #14 (Change 9 end-to-end): when a tablet server is already offline at the
-     * time {@code dropTable} is invoked, deletion must NOT throw and must NOT get stuck. Replicas
-     * on the dead tablet server should be parked in {@code ReplicaDeletionIneligible} and the table
-     * marked ineligible; once the tablet server reconnects, deletion completes.
+     * when a tablet server is already offline at the time {@code dropTable} is invoked, deletion
+     * must NOT throw and must NOT get stuck. Replicas on the dead tablet server should be parked in
+     * {@code ReplicaDeletionIneligible} and the table marked ineligible; once the tablet server
+     * reconnects, deletion completes.
      */
     @Test
     void testDropTableNotStuckWhenTabletServerDeadAtKickoff() throws Exception {
@@ -307,6 +282,75 @@ void testDropTableNotStuckWhenTabletServerDeadAtKickoff() throws Exception {
                                 .isFalse());
     }
 
+    /**
+     * Verifies that the {@code tableCount} metric (driven by {@code
+     * coordinatorContext.allTables().size()}) correctly decreases after a table is dropped, even
+     * when a tablet server is temporarily offline and the deletion is initially paused.
+     */
+    @Test
+    void testTableCountMetricDecreasesAfterDropWithOfflineTabletServer() throws Exception {
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllGatewayHasSameMetadata();
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        CoordinatorContext coordinatorContext =
+                FLUSS_CLUSTER_EXTENSION
+                        .getCoordinatorServer()
+                        .getCoordinatorEventProcessor()
+                        .getCoordinatorContext();
+
+        int initialTableCount = coordinatorContext.allTables().size();
+
+        TablePath tablePath = TablePath.of("test_db_stop_replica", "test_table_count_metric");
+        TableDescriptor tableDescriptor =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("a", DataTypes.INT())
+                                        .column("b", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(1)
+                        .property(ConfigOptions.TABLE_REPLICATION_FACTOR, 3)
+                        .build();
+        long tableId =
+                RpcMessageTestUtils.createTable(
+                        FLUSS_CLUSTER_EXTENSION, tablePath, tableDescriptor);
+        TableBucket tb = new TableBucket(tableId, 0);
+        FLUSS_CLUSTER_EXTENSION.waitUntilAllReplicaReady(tb);
+
+        assertThat(coordinatorContext.allTables().size()).isEqualTo(initialTableCount + 1);
+
+        List<Integer> isr = waitAndGetIsr(tb);
+        List<Path> tableDirs = assertReplicaExistAndGetTableOrPartitionDirs(tb, isr, false);
+
+        int leader = FLUSS_CLUSTER_EXTENSION.waitAndGetLeader(tb);
+        int offlineServerId =
+                isr.stream().filter(id -> id != leader).findFirst().orElse(isr.get(0));
+
+        FLUSS_CLUSTER_EXTENSION.stopTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(2);
+
+        coordinatorGateway
+                .dropTable(
+                        RpcMessageTestUtils.newDropTableRequest(
+                                tablePath.getDatabaseName(), tablePath.getTableName(), false))
+                .get();
+        assertThat(zkClient.tableExist(tablePath)).isFalse();
+
+        FLUSS_CLUSTER_EXTENSION.startTabletServer(offlineServerId);
+        FLUSS_CLUSTER_EXTENSION.assertHasTabletServerNumber(3);
+
+        retryUntilDeletionCleanedInZk(tb, isr, tableDirs, offlineServerId);
+
+        retry(
+                Duration.ofMinutes(1),
+                () ->
+                        assertThat(coordinatorContext.allTables().size())
+                                .as(
+                                        "tableCount should return to initial value after "
+                                                + "deletion completes")
+                                .isEqualTo(initialTableCount));
+    }
+
     /**
      * Verification item #27 (partition variant of #14): when a tablet server is offline at the time
      * a partition is dropped, partition deletion must NOT get stuck. Once the tablet server
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
index b802fad8e6..f57d1fb479 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TableManagerTest.java
@@ -478,7 +478,7 @@ void testResumeTableDeletionsStep3FiresWhenEligibilityRestored() throws Exceptio
         }
 
         // simulate the TS-reconnect / eligibility-restore path
-        coordinatorContext.markTableEligibleForDeletion(tableId);
+        coordinatorContext.removeTableFromIneligibleForDeletion(tableId);
 
         testingEventManager.clearEvents();
         tableManager.resumeDeletions();
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TestCoordinatorChannelManager.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TestCoordinatorChannelManager.java
index a25c4f4cee..59ac256ec1 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TestCoordinatorChannelManager.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/TestCoordinatorChannelManager.java
@@ -18,15 +18,27 @@
 package org.apache.fluss.server.coordinator;
 
 import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metadata.TableBucketReplica;
 import org.apache.fluss.rpc.RpcClient;
 import org.apache.fluss.rpc.gateway.TabletServerGateway;
+import org.apache.fluss.rpc.messages.StopReplicaRequest;
+import org.apache.fluss.rpc.messages.StopReplicaResponse;
 import org.apache.fluss.rpc.metrics.TestingClientMetricGroup;
+import org.apache.fluss.server.metrics.group.TestingMetricGroups;
+
+import javax.annotation.Nullable;
 
 import java.util.Collections;
 import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
+import java.util.function.BiConsumer;
 
-/** A coordinator channel manager for test purpose which can set gateways manually. */
+/**
+ * A coordinator channel manager for test purpose which can set gateways manually. Overrides {@link
+ * #sendStopBucketReplicaRequest} to invoke the callback synchronously via the direct gateway path,
+ * bypassing the per-TS sender thread used in production.
+ */
 public class TestCoordinatorChannelManager extends CoordinatorChannelManager {
 
     private Map<Integer, TabletServerGateway> gateways;
@@ -36,7 +48,11 @@ public TestCoordinatorChannelManager() {
     }
 
     public TestCoordinatorChannelManager(Map<Integer, TabletServerGateway> gateways) {
-        super(RpcClient.create(new Configuration(), TestingClientMetricGroup.newInstance()));
+        super(
+                RpcClient.create(new Configuration(), TestingClientMetricGroup.newInstance()),
+                () -> 0,
+                new Configuration(),
+                TestingMetricGroups.COORDINATOR_METRICS);
         this.gateways = gateways;
     }
 
@@ -44,6 +60,20 @@ public void setGateways(Map<Integer, TabletServerGateway> gateways) {
         this.gateways = gateways;
     }
 
+    @Override
+    public void sendStopBucketReplicaRequest(
+            int receiveServerId,
+            StopReplicaRequest stopReplicaRequest,
+            int coordinatorEpoch,
+            @Nullable Set<TableBucketReplica> deletionReplicas,
+            BiConsumer<StopReplicaResponse, ? super Throwable> responseConsumer) {
+        Optional<TabletServerGateway> gatewayOpt = getTabletServerGateway(receiveServerId);
+        if (gatewayOpt.isPresent()) {
+            gatewayOpt.get().stopReplica(stopReplicaRequest).whenComplete(responseConsumer);
+        }
+    }
+
+    @Override
     protected Optional<TabletServerGateway> getTabletServerGateway(int serverId) {
         return Optional.ofNullable(gateways.get(serverId));
     }
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThreadTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThreadTest.java
new file mode 100644
index 0000000000..5d835e6b30
--- /dev/null
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/channel/ControlRequestSendThreadTest.java
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.server.coordinator.channel;
+
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.metrics.Counter;
+import org.apache.fluss.metrics.Gauge;
+import org.apache.fluss.metrics.MetricNames;
+import org.apache.fluss.metrics.groups.MetricGroup;
+import org.apache.fluss.metrics.util.TestMetricGroup;
+import org.apache.fluss.rpc.gateway.TabletServerGateway;
+import org.apache.fluss.rpc.messages.StopReplicaRequest;
+import org.apache.fluss.rpc.messages.StopReplicaResponse;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Test;
+
+import java.lang.reflect.Proxy;
+import java.util.Optional;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.Function;
+import java.util.function.IntSupplier;
+import java.util.function.Supplier;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link ControlRequestSendThread}. */
+class ControlRequestSendThreadTest {
+
+    private static final int TABLET_SERVER_ID = 1;
+    private static final int EPOCH = 1;
+
+    private ControlRequestSendThread thread;
+
+    @AfterEach
+    void afterEach() throws InterruptedException {
+        if (thread != null) {
+            thread.initiateShutdown();
+            thread.awaitShutdown();
+        }
+    }
+
+    /** U1: happy path — single item dequeued, dispatched, callback invoked. */
+    @Test
+    void testHappyPath() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        StopReplicaResponse response = new StopReplicaResponse();
+        AtomicInteger invocationCount = new AtomicInteger(0);
+        TabletServerGateway gateway =
+                stubGateway(
+                        req -> {
+                            invocationCount.incrementAndGet();
+                            return CompletableFuture.completedFuture(response);
+                        });
+
+        AtomicReference<Object> callbackResponse = new AtomicReference<>();
+        CountDownLatch callbackLatch = new CountDownLatch(1);
+
+        thread = createThread(queue, () -> Optional.of(gateway), () -> EPOCH, metricGroup);
+        thread.start();
+
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> {
+                            callbackResponse.set(resp);
+                            callbackLatch.countDown();
+                        },
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(callbackLatch.await(5, TimeUnit.SECONDS)).isTrue();
+        assertThat(callbackResponse.get()).isSameAs(response);
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_RETRY_COUNT).getCount()).isEqualTo(0);
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_STALE_DROP_COUNT).getCount())
+                .isEqualTo(0);
+        assertThat(getGaugeValue(metricGroup, MetricNames.SENDER_ALIVE)).isEqualTo(1);
+        assertThat(invocationCount.get()).isEqualTo(1);
+    }
+
+    /** U2: retry then succeed — gateway fails once, succeeds on second attempt. */
+    @Test
+    void testRetryThenSucceed() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        StopReplicaResponse response = new StopReplicaResponse();
+        AtomicInteger callCount = new AtomicInteger(0);
+        TabletServerGateway gateway =
+                stubGateway(
+                        req -> {
+                            if (callCount.incrementAndGet() == 1) {
+                                CompletableFuture<StopReplicaResponse> fail =
+                                        new CompletableFuture<>();
+                                fail.completeExceptionally(new RuntimeException("transient error"));
+                                return fail;
+                            }
+                            return CompletableFuture.completedFuture(response);
+                        });
+
+        CountDownLatch callbackLatch = new CountDownLatch(1);
+
+        thread = createThread(queue, () -> Optional.of(gateway), () -> EPOCH, metricGroup);
+        thread.start();
+
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> callbackLatch.countDown(),
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(callbackLatch.await(5, TimeUnit.SECONDS)).isTrue();
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_RETRY_COUNT).getCount())
+                .isGreaterThanOrEqualTo(1);
+    }
+
+    /** U4: stale-epoch drop — item with older epoch is dropped, staleDropCount increments. */
+    @Test
+    void testStaleEpochDrop() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        StopReplicaResponse response = new StopReplicaResponse();
+        TabletServerGateway gateway =
+                stubGateway(req -> CompletableFuture.completedFuture(response));
+
+        CountDownLatch sentinelLatch = new CountDownLatch(1);
+
+        thread = createThread(queue, () -> Optional.of(gateway), () -> 5, metricGroup);
+        thread.start();
+
+        // stale item (epoch 1 < current 5)
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> {},
+                        1,
+                        System.currentTimeMillis(),
+                        null));
+
+        // sentinel item with current epoch to detect when stale item was processed
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> sentinelLatch.countDown(),
+                        5,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(sentinelLatch.await(5, TimeUnit.SECONDS)).isTrue();
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_STALE_DROP_COUNT).getCount())
+                .isEqualTo(1);
+    }
+
+    /** U5: gateway absent then present — retries until gateway becomes available. */
+    @Test
+    void testGatewayAbsentThenPresent() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        StopReplicaResponse response = new StopReplicaResponse();
+        TabletServerGateway gateway =
+                stubGateway(req -> CompletableFuture.completedFuture(response));
+
+        AtomicInteger gatewayCallCount = new AtomicInteger(0);
+        Supplier<Optional<TabletServerGateway>> gatewaySupplier =
+                () -> {
+                    if (gatewayCallCount.incrementAndGet() <= 3) {
+                        return Optional.empty();
+                    }
+                    return Optional.of(gateway);
+                };
+
+        CountDownLatch callbackLatch = new CountDownLatch(1);
+
+        thread = createThread(queue, gatewaySupplier, () -> EPOCH, metricGroup);
+        thread.start();
+
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> callbackLatch.countDown(),
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(callbackLatch.await(5, TimeUnit.SECONDS)).isTrue();
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_RETRY_COUNT).getCount())
+                .isGreaterThanOrEqualTo(3);
+    }
+
+    /** U7: shutdown cancels in-flight — shutdown completes quickly, callback never invoked. */
+    @Test
+    void testShutdownCancelsInFlight() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        CountDownLatch invokedLatch = new CountDownLatch(1);
+        CompletableFuture<StopReplicaResponse> neverCompleteFuture = new CompletableFuture<>();
+        TabletServerGateway gateway =
+                stubGateway(
+                        req -> {
+                            invokedLatch.countDown();
+                            return neverCompleteFuture;
+                        });
+
+        AtomicInteger callbackCount = new AtomicInteger(0);
+
+        thread = createThread(queue, () -> Optional.of(gateway), () -> EPOCH, metricGroup);
+        thread.start();
+
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> callbackCount.incrementAndGet(),
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(invokedLatch.await(5, TimeUnit.SECONDS)).isTrue();
+
+        long shutdownStart = System.currentTimeMillis();
+        thread.shutdown();
+        long shutdownDuration = System.currentTimeMillis() - shutdownStart;
+
+        assertThat(shutdownDuration).isLessThan(2000);
+        assertThat(callbackCount.get()).isEqualTo(0);
+        assertThat(getGaugeValue(metricGroup, MetricNames.SENDER_ALIVE)).isEqualTo(0);
+        thread = null;
+    }
+
+    /** U8: callback throws after successful send — thread survives, no re-send. */
+    @Test
+    void testCallbackThrowsDoesNotRetry() throws Exception {
+        BlockingQueue<QueueItem> queue = new LinkedBlockingQueue<>();
+        MetricGroup metricGroup = TestMetricGroup.createTestMetricGroup();
+
+        StopReplicaResponse response = new StopReplicaResponse();
+        AtomicInteger invocationCount = new AtomicInteger(0);
+        TabletServerGateway gateway =
+                stubGateway(
+                        req -> {
+                            invocationCount.incrementAndGet();
+                            return CompletableFuture.completedFuture(response);
+                        });
+
+        CountDownLatch secondCallbackLatch = new CountDownLatch(1);
+
+        thread = createThread(queue, () -> Optional.of(gateway), () -> EPOCH, metricGroup);
+        thread.start();
+
+        // first item: callback throws
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> {
+                            throw new RuntimeException("buggy callback");
+                        },
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        // second item: proves thread survived
+        queue.put(
+                new QueueItem(
+                        ApiKey.STOP_REPLICA,
+                        new StopReplicaRequest(),
+                        (resp, err) -> secondCallbackLatch.countDown(),
+                        EPOCH,
+                        System.currentTimeMillis(),
+                        null));
+
+        assertThat(secondCallbackLatch.await(5, TimeUnit.SECONDS)).isTrue();
+        // gateway invoked exactly twice (once per item, no re-send from callback failure)
+        assertThat(invocationCount.get()).isEqualTo(2);
+        assertThat(getCounter(metricGroup, MetricNames.SENDER_RETRY_COUNT).getCount()).isEqualTo(0);
+        assertThat(getGaugeValue(metricGroup, MetricNames.SENDER_ALIVE)).isEqualTo(1);
+    }
+
+    @SuppressWarnings("unchecked")
+    private static TabletServerGateway stubGateway(
+            Function<StopReplicaRequest, CompletableFuture<StopReplicaResponse>> stopReplicaFn) {
+        return (TabletServerGateway)
+                Proxy.newProxyInstance(
+                        TabletServerGateway.class.getClassLoader(),
+                        new Class<?>[] {TabletServerGateway.class},
+                        (proxy, method, args) -> {
+                            if ("stopReplica".equals(method.getName())) {
+                                return stopReplicaFn.apply((StopReplicaRequest) args[0]);
+                            }
+                            throw new UnsupportedOperationException(
+                                    "Stub does not support: " + method.getName());
+                        });
+    }
+
+    private ControlRequestSendThread createThread(
+            BlockingQueue<QueueItem> queue,
+            Supplier<Optional<TabletServerGateway>> gatewaySupplier,
+            IntSupplier epochSupplier,
+            MetricGroup metricGroup) {
+        Configuration conf = new Configuration();
+        conf.set(ConfigOptions.COORDINATOR_REQUEST_RETRY_BACKOFF, java.time.Duration.ofMillis(10));
+        return new ControlRequestSendThread(
+                TABLET_SERVER_ID, queue, gatewaySupplier, epochSupplier, conf, metricGroup);
+    }
+
+    private static Counter getCounter(MetricGroup group, String name) {
+        return (Counter) ((TestMetricGroup) group).getMetric(name);
+    }
+
+    @SuppressWarnings("unchecked")
+    private static int getGaugeValue(MetricGroup group, String name) {
+        return ((Gauge<Integer>) ((TestMetricGroup) group).getMetric(name)).getValue();
+    }
+}
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
index 3f06b56fb4..f86a7c3042 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/ReplicaStateMachineTest.java
@@ -33,6 +33,7 @@
 import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
 import org.apache.fluss.server.entity.DeleteReplicaResultForBucket;
 import org.apache.fluss.server.metadata.ServerInfo;
+import org.apache.fluss.server.metrics.group.TestingMetricGroups;
 import org.apache.fluss.server.zk.NOPErrorHandler;
 import org.apache.fluss.server.zk.ZkEpoch;
 import org.apache.fluss.server.zk.ZooKeeperClient;
@@ -365,7 +366,10 @@ private ReplicaStateMachine createReplicaStateMachine(CoordinatorContext coordin
                         new CoordinatorChannelManager(
                                 RpcClient.create(
                                         new Configuration(),
-                                        TestingClientMetricGroup.newInstance())),
+                                        TestingClientMetricGroup.newInstance()),
+                                () -> 0,
+                                new Configuration(),
+                                TestingMetricGroups.COORDINATOR_METRICS),
                         (event) -> {
                             // do nothing
                         },
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
index 4dca20f567..b3dd36e575 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/statemachine/TableBucketStateMachineTest.java
@@ -300,7 +300,10 @@ void testStateChangeToOnline() throws Exception {
                         new CoordinatorChannelManager(
                                 RpcClient.create(
                                         new Configuration(),
-                                        TestingClientMetricGroup.newInstance())),
+                                        TestingClientMetricGroup.newInstance()),
+                                () -> 0,
+                                new Configuration(),
+                                TestingMetricGroups.COORDINATOR_METRICS),
                         coordinatorContext,
                         autoPartitionManager,
                         lakeTableTieringManager,