diff --git a/scripts/lab_dual_node_outage_soak_mixed_churn.py b/scripts/lab_dual_node_outage_soak_mixed_churn.py new file mode 100644 index 000000000..139dd48d5 --- /dev/null +++ b/scripts/lab_dual_node_outage_soak_mixed_churn.py @@ -0,0 +1,1892 @@ +#!/usr/bin/env python3 +"""Lab variant of aws_dual_node_outage_soak_mixed_churn.py. + +Designed to run from the simplyblock jump host against the bare-metal lab +cluster (mgmt 192.168.10.210, storage .201-.204) deployed by +setup_lab_perf_test1.py. + +Iteration pattern (load during outage, unload during settle): + 1. start fio on every volume + 2. apply the dual-node outage pair (fio takes the hit) + 3. wait for both nodes back online + post-outage check_fio (fault gate) + 4. stop fio + 5. optionally rebuild one randomly-selected volume + (every --churn-every-n-iters iterations) + 6. wait_for_cluster_stable + wait_for_data_migration_complete -- now + UNLOADED, so migration drains fast + 7. next iteration + +This trades the AWS-variant's "fio runs continuously across outages plus +a 3-20 minute background churn timer" for a deterministic per-iteration +pattern: settling never happens under fio load, so iteration time drops +to whatever rebalance takes on a quiet cluster. There is no inter- +iteration NIC chaos. + +Differences vs. the AWS variant beyond the iteration pattern: + - SSH uses a single shared root password (CLI flag, env var, or prompt), + not an EC2 keypair. Install paramiko if you can ("pip3 install --user + paramiko" on the jump host); otherwise the script falls back to + `sshpass + ssh`, which is slower because every command opens a fresh + connection. + - Defaults match the lab topology: --expected-node-count 4, metadata + file cluster_metadata_base.json (the file setup_lab_perf_test1.py + writes), mount root /root/lab_outage_soak_*. + +Typical invocation (from the jump host, after setup_lab_perf_test1.py has +created the cluster and written cluster_metadata_base.json next to this +script): + + python3 ~/lab_dual_node_outage_soak_mixed_churn.py # prompts for password + SBCLI_ROOT_PASSWORD='...' python3 ~/lab_dual_node_outage_soak_mixed_churn.py + python3 ~/lab_dual_node_outage_soak_mixed_churn.py --password '...' +""" +import argparse +import getpass +import itertools +import json +import logging +import os +import posixpath +import random +import re +import shlex +import subprocess +import sys +import threading +import time +from dataclasses import dataclass +from pathlib import Path + +try: + import paramiko +except ImportError: + paramiko = None + +# Silence paramiko's Transport-thread "Socket exception: Connection +# reset by peer (104)" prints. They fire whenever an open SSH +# connection to a storage node gets RST'd by a planned event — +# host_reboot outage tearing down sshd, NIC down/up flapping, etc. +# The retry/reconnect logic handles it cleanly; the stack-trace-less +# stderr lines just clutter the soak output. +logging.getLogger("paramiko").setLevel(logging.CRITICAL) +logging.getLogger("paramiko.transport").setLevel(logging.CRITICAL) + + +UUID_RE = re.compile(r"[a-f0-9]{8}(?:-[a-f0-9]{4}){3}-[a-f0-9]{12}") +# `sbctl lvol connect` emits `sudo nvme connect ... --nqn= ...` +# (long form with `=`, see lvol_controller.py:1737). Tolerate the legacy +# short form `-n ` as well so older sbctl deployments still parse. 
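+# Illustrative matches for NQN_RE (the NQN value is a made-up placeholder,
+# not output captured from a live cluster):
+#   "sudo nvme connect ... --nqn=nqn.2023-02.io.simplyblock:lvol-1"
+#   "sudo nvme connect ... -n nqn.2023-02.io.simplyblock:lvol-1"
+# both yield m.group(1) == "nqn.2023-02.io.simplyblock:lvol-1".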
+NQN_RE = re.compile(r"(?:--nqn[=\s]+|-n\s+)(\S+)") + + +OUTAGE_METHODS = ( + "graceful", "forced", "container_kill", "host_reboot", + "network_outage_20", "network_outage_50", +) +# Methods that leave the node in a state where it recovers on its own +# (no sbctl restart required from the soak driver). +AUTO_RECOVER_METHODS = ( + "container_kill", "host_reboot", + "network_outage_20", "network_outage_50", +) + +# Scenario enumeration: +# 3 role categories × P(M,2) ordered distinct-method pairs +# = 3 × M·(M-1) scenarios per cycle. +# Examples: +# M=5 → 3 × 20 = 60 +# M=6 → 3 × 30 = 90 +# Role categories (relative ring-distance preserved; the actual node pair +# is re-rolled randomly per scenario at execution time so the soak hits +# many different concrete pairs while keeping the topological distance +# fixed for each category). +# Order matters: the soak walks the full method permutation for one +# category before moving on. "unrelated" runs first so the outage with +# the widest blast-radius coverage (two nodes from different LVS rings) +# exercises the cluster before the within-ring categories. +# - unrelated : pair sharing no LVS in any role — ring-distance +# ≥ 3 (≥ 2 nodes between). +# - primary_tertiary : primary + tertiary of same LVS — ring-distance +# 2 (exactly one node between); no replication +# edge connects them (jumps over the secondary). +# - primary_secondary : primary + secondary of same LVS — ring-distance +# 1 (direct successor). Represents both (P,S) and +# (S,T): two adjacent replicas of the same LVS +# going down is structurally symmetric regardless +# of which end. +# Same-method pairs (graceful,graceful etc.) are not enumerated — the +# user-agreed count 30 for 6 methods equals 6·5, not 6². +ROLE_CATEGORIES = ("unrelated", "primary_tertiary", "primary_secondary") + + +def parse_args(): + default_metadata = Path(__file__).with_name("cluster_metadata_base.json") + default_log_dir = Path(__file__).parent + + parser = argparse.ArgumentParser( + description=( + "Run a long fio soak against the lab cluster while cycling random " + "two-node outages with mixed outage methods. Each iteration: start " + "fio on every volume, apply the outage pair, fault-check fio, " + "stop fio, optionally rebuild one volume, then wait for the cluster " + "to settle UNLOADED (no IO pressure on rebalance/data-migration). " + "Trades 'fio always running' for fast iteration time." 
+ ) + ) + parser.add_argument("--metadata", default=str(default_metadata), help="Path to cluster metadata JSON.") + parser.add_argument("--pool", default="pool01", help="Pool name for volume creation.") + parser.add_argument("--expected-node-count", type=int, default=4, help="Required storage node count.") + parser.add_argument("--volume-size", default="25G", help="Volume size to create per storage node.") + parser.add_argument("--runtime", type=int, default=72000, help="fio runtime in seconds.") + parser.add_argument("--restart-timeout", type=int, default=900, help="Seconds to wait for restarted nodes.") + parser.add_argument("--rebalance-timeout", type=int, default=7200, help="Seconds to wait for rebalancing.") + parser.add_argument("--poll-interval", type=int, default=10, help="Poll interval for health checks.") + parser.add_argument( + "--shutdown-gap", + type=int, + default=0, + help="Optional delay between shutting down the two selected nodes.", + ) + parser.add_argument( + "--log-file", + default=str(default_log_dir / f"aws_dual_node_outage_soak_{time.strftime('%Y%m%d_%H%M%S')}.log"), + help="Single log file for script and CLI output.", + ) + parser.add_argument( + "--run-on-mgmt", + action="store_true", + help="Run management-node commands locally instead of over SSH.", + ) + parser.add_argument( + "--password", + default=None, + help=( + "Root password shared by mgmt+storage nodes. If omitted, falls " + "back to the SBCLI_ROOT_PASSWORD env var, then to an interactive " + "prompt. Avoid the flag form on shared hosts (visible in `ps`)." + ), + ) + parser.add_argument( + "--methods", + default=",".join(OUTAGE_METHODS), + help=( + "Comma-separated subset of outage methods to pick from per iteration. " + f"Choices: {','.join(OUTAGE_METHODS)}. " + "Each iteration picks 2 distinct methods at random." + ), + ) + parser.add_argument( + "--auto-recover-wait", + type=int, + default=900, + help=( + "Seconds to wait for a node to return online after a container_kill " + "or host_reboot outage (no sbctl restart is issued)." + ), + ) + parser.add_argument( + "--cycles", + type=int, + default=1, + help=( + "Number of passes through the full deterministic scenario list. " + "Each pass covers C(N,2)*M² scenarios (250 for 5 nodes × 5 methods; " + "540 for 6 × 6). 0 means loop forever." + ), + ) + parser.add_argument( + "--shuffle-scenarios", + action="store_true", + help=( + "Shuffle scenario order per cycle (seeded deterministically off " + "the cycle index). Useful when a full cycle is too long to finish " + "and you want even coverage across early/mid/late pairs." + ), + ) + parser.add_argument( + "--start-at", + type=int, + default=1, + help=( + "Start the first cycle at scenario N (1-indexed). Scenarios " + "1..N-1 are skipped in the first cycle only; subsequent cycles " + "run from scenario 1 as normal. Use to resume after a failure — " + "e.g. --start-at 60 if scenario 60 is the one that failed." + ), + ) + parser.add_argument( + "--churn-every-n-iters", + type=int, + default=3, + help=( + "Rebuild one randomly-chosen volume every N outage iterations, " + "in the unloaded window between fio-stop and rebalance-wait. " + "Default 3. Set to 0 to disable; --no-churn is an explicit alias." 
+ ), + ) + parser.add_argument( + "--no-churn", + action="store_true", + help="Disable per-iteration volume churn entirely.", + ) + args = parser.parse_args() + methods = [m.strip() for m in args.methods.split(",") if m.strip()] + bad = [m for m in methods if m not in OUTAGE_METHODS] + if bad: + parser.error(f"Unknown outage method(s): {bad}. Choices: {list(OUTAGE_METHODS)}") + if not methods: + parser.error("At least one outage method must be enabled") + args.methods = methods + if args.churn_every_n_iters < 0: + parser.error("--churn-every-n-iters must be >= 0") + args.password = resolve_password(args.password) + if not args.password: + parser.error("Empty root password; supply --password, SBCLI_ROOT_PASSWORD, or answer the prompt.") + if subprocess.run(["which", "sshpass"], capture_output=True).returncode != 0 and paramiko is None: + parser.error( + "Neither paramiko nor sshpass is available on this host. " + "Install one ('pip3 install --user paramiko' or 'sudo dnf install sshpass') " + "before running." + ) + return args + + +def resolve_password(cli_value): + if cli_value: + return cli_value + env_value = os.environ.get("SBCLI_ROOT_PASSWORD") + if env_value: + return env_value + return getpass.getpass("Root password for lab nodes (.210, .201-.204): ") + + +def load_metadata(path): + with open(path, "r", encoding="utf-8") as handle: + return json.load(handle) + + +class Logger: + def __init__(self, path): + self.path = path + self.lock = threading.Lock() + Path(path).parent.mkdir(parents=True, exist_ok=True) + + def log(self, message): + line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}" + with self.lock: + print(line, flush=True) + with open(self.path, "a", encoding="utf-8") as handle: + handle.write(line + "\n") + + def block(self, header, content): + if content is None: + return + text = content.rstrip() + if not text: + return + with self.lock: + with open(self.path, "a", encoding="utf-8") as handle: + handle.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {header}\n") + handle.write(text + "\n") + + +class RemoteCommandError(RuntimeError): + pass + + +class RemoteHost: + def __init__(self, hostname, user, password, logger, name): + self.hostname = hostname + self.user = user + self.password = password + self.logger = logger + self.name = name + self.client = None + self.connect() + + def connect(self): + if paramiko is None: + return + self.close() + last_error = None + for attempt in range(1, 16): + try: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect( + hostname=self.hostname, + username=self.user, + password=self.password, + timeout=15, + banner_timeout=15, + auth_timeout=15, + allow_agent=False, + look_for_keys=False, + ) + transport = client.get_transport() + if transport is not None: + transport.set_keepalive(30) + self.client = client + return + except Exception as exc: + last_error = exc + self.logger.log( + f"{self.name}: SSH attempt {attempt}/15 failed to {self.hostname}: {exc}" + ) + time.sleep(5) + raise RemoteCommandError(f"{self.name}: failed to connect to {self.hostname}: {last_error}") + + def run(self, command, timeout=600, check=True, label=None): + if paramiko is None: + return self._run_via_ssh_cli(command, timeout=timeout, check=check, label=label) + if self.client is None: + self.connect() + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + try: + stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout) + stdout_text = 
stdout.read().decode("utf-8", errors="replace") + stderr_text = stderr.read().decode("utf-8", errors="replace") + rc = stdout.channel.recv_exit_status() + except Exception as exc: + self.logger.log(f"{self.name}: command transport failure for {label}: {exc}; reconnecting once") + self.connect() + stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout) + stdout_text = stdout.read().decode("utf-8", errors="replace") + stderr_text = stderr.read().decode("utf-8", errors="replace") + rc = stdout.channel.recv_exit_status() + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError( + f"{self.name}: command failed with rc={rc}: {label}" + ) + return rc, stdout_text, stderr_text + + def _run_via_ssh_cli(self, command, timeout=600, check=True, label=None): + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + ssh_cmd = [ + "sshpass", "-e", + "ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "LogLevel=ERROR", + "-o", "ConnectTimeout=15", + "-o", "ServerAliveInterval=30", + f"{self.user}@{self.hostname}", + command, + ] + env = os.environ.copy() + env["SSHPASS"] = self.password + try: + completed = subprocess.run( + ssh_cmd, + capture_output=True, + text=True, + timeout=timeout, + check=False, + env=env, + ) + except subprocess.TimeoutExpired as exc: + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + raise RemoteCommandError(f"{self.name}: command timed out: {label}") from exc + stdout_text = completed.stdout or "" + stderr_text = completed.stderr or "" + rc = completed.returncode + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError(f"{self.name}: command failed with rc={rc}: {label}") + return rc, stdout_text, stderr_text + + def close(self): + if self.client is not None: + self.client.close() + self.client = None + + +class LocalHost: + def __init__(self, logger, name): + self.logger = logger + self.name = name + + def run(self, command, timeout=600, check=True, label=None): + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + try: + completed = subprocess.run( + ["/bin/bash", "-lc", command], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + except subprocess.TimeoutExpired as exc: + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + raise RemoteCommandError(f"{self.name}: command timed out: {label}") from exc + stdout_text = completed.stdout or "" + stderr_text = completed.stderr or "" + rc = completed.returncode + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError(f"{self.name}: command failed with rc={rc}: {label}") + return rc, stdout_text, stderr_text + + def close(self): + return + + +# Number of fio worker processes per --name. Must match --numjobs in +# start_fio(). 
+# --group_reporting aggregates all workers into one report, so a single
+# fio summary + per-run stderr stream is sufficient to diagnose any fio
+# fault.
+FIO_NUMJOBS = 4
+
+
+@dataclass
+class FioJob:
+    volume_id: str
+    volume_name: str
+    mount_point: str
+    fio_log: str     # fio's --output summary file (written on exit)
+    fio_stderr: str  # captured stdout+stderr during the run (progress,
+                     # errors, "max_latency exceeded" messages). This is
+                     # the primary source of ground truth for fio faults.
+    rc_file: str
+    pid: int
+    fio_name: str    # matches --name= in the fio command line
+
+
+class TestRunError(RuntimeError):
+    pass
+
+
+class SoakRunner:
+    def __init__(self, args, metadata, logger):
+        self.args = args
+        self.metadata = metadata
+        self.logger = logger
+        self.user = metadata["user"]
+        self.password = args.password
+        self.run_id = time.strftime("%Y%m%d_%H%M%S")
+        if args.run_on_mgmt:
+            self.mgmt = LocalHost(logger, "mgmt")
+        else:
+            self.mgmt = RemoteHost(metadata["mgmt"]["public_ip"], self.user, self.password, logger, "mgmt")
+        client_entry = metadata["clients"][0]
+        if args.run_on_mgmt:
+            client_addr = client_entry.get("private_ip") or client_entry["public_ip"]
+        else:
+            client_addr = client_entry["public_ip"]
+        self.client = RemoteHost(client_addr, self.user, self.password, logger, "client")
+        self.cluster_id = metadata.get("cluster_uuid") or ""
+        self.fio_jobs = []
+        # Stored so the churn cycle can pick a random volume to rebuild.
+        self.volumes = []
+        self.created_volume_ids = []
+        # Mixed-outage state
+        self.methods = list(args.methods)
+        # Dropping all data NICs on a node (network_outage_*) is a
+        # simple-cluster-only scenario, so multipath clusters exclude those
+        # methods up front. (Unlike the AWS variant, this script has no
+        # inter-iteration single-NIC chaos to supply network-layer coverage
+        # on multipath clusters.)
+        if self._is_multipath():
+            filtered = [m for m in self.methods if not m.startswith("network_outage_")]
+            dropped = [m for m in self.methods if m not in filtered]
+            if dropped:
+                self.logger.log(
+                    f"multipath cluster detected: excluding {dropped} from outage methods"
+                )
+            if not filtered:
+                raise TestRunError(
+                    "No outage methods remain after excluding network_outage_* "
+                    "on multipath cluster; pass --methods with at least one "
+                    "non-network_outage method"
+                )
+            self.methods = filtered
+        self.node_hosts = {}  # uuid -> RemoteHost (private_ip of storage node)
+        self.node_ip_map = self._build_node_ip_map()
+        # Counter incremented by every churn cycle; embedded in fio --name and
+        # the rebuilt volume name so logs/pkill targets remain unique across
+        # iterations even when the same volume index is rebuilt repeatedly.
+        self.churn_counter = 0
+        self.mount_root = None
+
+    def close(self):
+        self.client.close()
+        self.mgmt.close()
+        for host in self.node_hosts.values():
+            try:
+                host.close()
+            except Exception:
+                pass
+
+    def _build_node_ip_map(self):
+        """Return {uuid: private_ip} for every storage node we know about."""
+        ip_map = {}
+        topology = self.metadata.get("topology") or {}
+        for node in topology.get("nodes", []):
+            uuid = node.get("uuid")
+            ip = node.get("management_ip") or node.get("private_ip")
+            if uuid and ip:
+                ip_map[uuid] = ip
+        # Fallback: pair storage_nodes list with sbctl-returned UUIDs by mgmt IP,
+        # which is done lazily in _resolve_node_ip below.
+        return ip_map
+
+    def _resolve_node_ip(self, uuid):
+        """Return the private/mgmt IP for a storage node UUID, refreshing via
+        sbctl if we haven't seen it in metadata."""
+        ip = self.node_ip_map.get(uuid)
+        if ip:
+            return ip
+        # Try fetching via sbctl sn list JSON.
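+        # Assumed row shape for the key fallbacks below (field names vary
+        # across sbctl versions; values here are illustrative, not captured
+        # output):
+        #   {"UUID": "<node-uuid>", "Status": "online",
+        #    "Mgmt IP": "192.168.10.201", ...}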
+ nodes = self.sbctl("sn list --json", json_output=True) + for node in nodes: + candidate_ip = ( + node.get("Management IP") + or node.get("Mgmt IP") + or node.get("mgmt_ip") + or node.get("management_ip") + ) + if node.get("UUID") == uuid and candidate_ip: + self.node_ip_map[uuid] = candidate_ip + return candidate_ip + raise TestRunError(f"Cannot resolve storage-node IP for UUID {uuid}") + + def _node_host(self, uuid): + """Lazily create a RemoteHost for a storage node identified by UUID.""" + if uuid in self.node_hosts: + return self.node_hosts[uuid] + ip = self._resolve_node_ip(uuid) + host = RemoteHost(ip, self.user, self.password, self.logger, f"sn[{ip}]") + self.node_hosts[uuid] = host + return host + + def sbctl(self, args, timeout=600, json_output=False): + command = "sudo /usr/local/bin/sbctl -d " + args + _, stdout_text, stderr_text = self.mgmt.run( + command, + timeout=timeout, + check=True, + label=f"sbctl {args}", + ) + if not json_output: + return stdout_text + for candidate in (stdout_text, stderr_text, stdout_text + "\n" + stderr_text): + candidate = candidate.strip() + if not candidate: + continue + try: + return json.loads(candidate) + except json.JSONDecodeError: + pass + decoder = json.JSONDecoder() + final_payloads = [] + list_payloads = [] + dict_payloads = [] + for start, char in enumerate(candidate): + if char not in "[{": + continue + try: + obj, end = decoder.raw_decode(candidate[start:]) + except json.JSONDecodeError: + continue + if not isinstance(obj, (dict, list)): + continue + if not candidate[start + end:].strip(): + final_payloads.append(obj) + elif isinstance(obj, list): + list_payloads.append(obj) + else: + dict_payloads.append(obj) + if final_payloads: + return final_payloads[-1] + if list_payloads: + return list_payloads[-1] + if dict_payloads: + return dict_payloads[-1] + raise TestRunError(f"Failed to parse JSON from sbctl {args}") + + def ensure_prerequisites(self): + self.logger.log(f"Using password auth as {self.user}; paramiko={'yes' if paramiko else 'no (sshpass fallback)'}") + self.client.run( + "if command -v dnf >/dev/null 2>&1; then " + "sudo dnf install -y nvme-cli fio xfsprogs; " + "else sudo apt-get update && sudo apt-get install -y nvme-cli fio xfsprogs; fi", + timeout=1800, + label="install client packages", + ) + self.client.run("sudo modprobe nvme_tcp", timeout=60, label="load nvme_tcp") + + def get_cluster_id(self): + if self.cluster_id: + return self.cluster_id + clusters = self.sbctl("cluster list --json", json_output=True) + if not clusters: + raise TestRunError("No clusters returned by sbctl cluster list") + self.cluster_id = clusters[0]["UUID"] + return self.cluster_id + + def get_nodes(self): + nodes = self.sbctl("sn list --json", json_output=True) + parsed = [] + for node in nodes: + parsed.append( + { + "uuid": node["UUID"], + "status": str(node.get("Status", "")).lower(), + "mgmt_ip": node.get("Mgmt IP") or node.get("mgmt_ip") or "", + "hostname": node.get("Hostname") or "", + } + ) + return parsed + + def ensure_expected_nodes(self): + nodes = self.get_nodes() + if len(nodes) != self.args.expected_node_count: + raise TestRunError( + f"Expected {self.args.expected_node_count} storage nodes, found {len(nodes)}. " + f"Update metadata or pass --expected-node-count." 
+ ) + return nodes + + def assert_cluster_not_suspended(self): + clusters = self.sbctl("cluster list --json", json_output=True) + if not clusters: + raise TestRunError("Cluster list returned no rows") + status = str(clusters[0].get("Status", "")).lower() + if status == "suspended": + raise TestRunError("Cluster is suspended") + return status + + def wait_for_all_online(self, target_nodes=None, timeout=None): + timeout = timeout or self.args.restart_timeout + expected = self.args.expected_node_count + target_nodes = set(target_nodes or []) + started = time.time() + while time.time() - started < timeout: + self.assert_cluster_not_suspended() + nodes = self.ensure_expected_nodes() + statuses = {node["uuid"]: node["status"] for node in nodes} + offline = [uuid for uuid, status in statuses.items() if status != "online"] + unaffected_bad = [ + uuid for uuid, status in statuses.items() + if uuid not in target_nodes and status != "online" + ] + if unaffected_bad: + raise TestRunError( + "Unaffected nodes are not online: " + + ", ".join(f"{uuid}:{statuses[uuid]}" for uuid in unaffected_bad) + ) + if not offline and len(statuses) == expected: + return nodes + self.logger.log( + "Waiting for all nodes online: " + + ", ".join(f"{uuid}:{status}" for uuid, status in statuses.items()) + ) + time.sleep(self.args.poll_interval) + raise TestRunError("Timed out waiting for nodes to return online") + + def wait_for_cluster_stable(self): + cluster_id = self.get_cluster_id() + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + cluster_list = self.sbctl("cluster list --json", json_output=True) + status = str(cluster_list[0].get("Status", "")).lower() + if status == "suspended": + raise TestRunError("Cluster entered suspended state") + cluster_info = self.sbctl(f"cluster get {cluster_id}", json_output=True) + rebalancing = bool(cluster_info.get("is_re_balancing", False)) + nodes = self.ensure_expected_nodes() + node_statuses = {node["uuid"]: node["status"] for node in nodes} + if status == "active" and not rebalancing and all( + state == "online" for state in node_statuses.values() + ): + self.logger.log("Cluster stable: ACTIVE, online, not rebalancing") + return + self.logger.log( + "Waiting for cluster stability: " + f"status={status}, rebalancing={rebalancing}, " + + ", ".join(f"{uuid}:{state}" for uuid, state in node_statuses.items()) + ) + time.sleep(self.args.poll_interval) + raise TestRunError("Timed out waiting for cluster rebalancing to finish") + + def get_active_tasks(self): + cluster_id = self.get_cluster_id() + script = ( + "import json; " + "from simplyblock_core import db_controller; " + "from simplyblock_core.models.job_schedule import JobSchedule; " + "db = db_controller.DBController(); " + f"tasks = db.get_job_tasks({cluster_id!r}, reverse=False); " + "out = [t.get_clean_dict() for t in tasks " + "if t.status != JobSchedule.STATUS_DONE and not getattr(t, 'canceled', False)]; " + "print(json.dumps(out))" + ) + out = self.mgmt.run( + f"sudo python3 -c {shlex.quote(script)}", + timeout=60, + label="list active tasks", + )[1].strip() + return json.loads(out or "[]") + + def wait_for_no_active_tasks(self, reason): + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.assert_cluster_not_suspended() + active_tasks = self.get_active_tasks() + if not active_tasks: + return + details = ", ".join( + f"{task.get('function_name')}:{task.get('status')}:{task.get('node_id') or task.get('device_id')}" + for task in active_tasks + ) + 
self.logger.log(f"Waiting before {reason}; active tasks: {details}") + time.sleep(self.args.poll_interval) + raise TestRunError(f"Timed out waiting for active tasks to finish before {reason}") + + @staticmethod + def _is_data_migration_task(task): + function_name = str(task.get("function_name", "")).lower() + task_name = str(task.get("task_name", "")).lower() + task_type = str(task.get("task_type", "")).lower() + haystack = " ".join([function_name, task_name, task_type]) + markers = ( + "migration", + "rebalanc", + "sync", + ) + return any(marker in haystack for marker in markers) + + def wait_for_data_migration_complete(self, reason): + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.assert_cluster_not_suspended() + active_tasks = self.get_active_tasks() + migration_tasks = [task for task in active_tasks if self._is_data_migration_task(task)] + if not migration_tasks: + return + details = ", ".join( + f"{task.get('function_name')}:{task.get('status')}:{task.get('node_id') or task.get('device_id')}" + for task in migration_tasks + ) + self.logger.log(f"Waiting before {reason}; data migration tasks: {details}") + time.sleep(self.args.poll_interval) + raise TestRunError( + f"Timed out waiting for data migration tasks to finish before {reason}" + ) + + def sbctl_allow_failure(self, args, timeout=600): + command = "sudo /usr/local/bin/sbctl -d " + args + rc, stdout_text, stderr_text = self.mgmt.run( + command, + timeout=timeout, + check=False, + label=f"sbctl {args}", + ) + return rc, stdout_text, stderr_text + + def shutdown_with_migration_retry(self, node_id): + while True: + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"sn shutdown {node_id}", + timeout=300, + ) + if rc == 0: + return + output = f"{stdout_text}\n{stderr_text}".lower() + retry_markers = ( + "migration", + "migrat", + "rebalanc", + "active task", + "running task", + "in_progress", + "in progress", + ) + if any(marker in output for marker in retry_markers): + self.logger.log( + f"Shutdown of {node_id} blocked by migration/rebalance/task; retrying in 15s" + ) + time.sleep(15) + continue + raise RemoteCommandError( + f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id}" + ) + + def prepare_client(self): + home = "/root" if self.user == "root" else posixpath.join("/home", self.user) + mount_root = posixpath.join(home, f"lab_outage_soak_{self.run_id}") + command = ( + "sudo pkill -f '[f]io --name=aws_dual_soak_' || true\n" + f"sudo mkdir -p {shlex.quote(mount_root)}\n" + f"sudo chown {shlex.quote(self.user)}:{shlex.quote(self.user)} {shlex.quote(mount_root)}\n" + ) + self.client.run(f"bash -lc {shlex.quote(command)}", timeout=120, label="prepare client workspace") + return mount_root + + def extract_uuid(self, text): + for line in reversed(text.splitlines()): + stripped = line.strip() + if UUID_RE.fullmatch(stripped): + return stripped + raise TestRunError(f"Failed to extract standalone UUID from output: {text}") + + def _create_one_volume(self, volume_name, node_uuid, index): + """Create one lvol bound to ``node_uuid`` and return its volume dict. + + Retries inside the rebalance window if the LVStore is being recreated + or while a rebalance / data migration is in flight, matching the + behaviour of the bulk ``create_volumes`` path. 
+ """ + volume_id = None + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.wait_for_all_online(timeout=self.args.restart_timeout) + self.wait_for_cluster_stable() + output = self.sbctl( + f"lvol add {volume_name} {self.args.volume_size} {self.args.pool} --host-id {node_uuid}" + ) + if "ERROR:" in output or "LVStore is being recreated" in output: + self.logger.log(f"Volume create for {volume_name} deferred: {output.strip()}") + time.sleep(self.args.poll_interval) + continue + volume_id = self.extract_uuid(output) + break + if volume_id is None: + raise TestRunError(f"Timed out creating volume {volume_name} on node {node_uuid}") + self.created_volume_ids.append(volume_id) + self.logger.log(f"Created volume {volume_name} ({volume_id}) on node {node_uuid}") + return { + "index": index, + "volume_name": volume_name, + "volume_id": volume_id, + "node_uuid": node_uuid, + } + + def create_volumes(self, nodes): + self.logger.log( + f"Creating {len(nodes)} volumes of size {self.args.volume_size}, one per storage node" + ) + volumes = [] + for index, node in enumerate(nodes, start=1): + volume_name = f"aws_dual_soak_{self.run_id}_v{index}" + volumes.append(self._create_one_volume(volume_name, node["uuid"], index)) + return volumes + + def connect_and_mount_volumes(self, volumes, mount_root): + self.logger.log("Connecting volumes to client and preparing filesystems") + for volume in volumes: + self._connect_and_mount_one(volume, mount_root) + + def _connect_and_mount_one(self, volume, mount_root): + """Connect, mkfs, mount a single volume. Mutates ``volume`` to add + mount_point / fio_log / fio_stderr / rc_file / nqn keys. + + Saving ``nqn`` lets the churn cycle disconnect via ``nvme disconnect + -n `` without having to re-derive it from the device path. 
+ """ + connect_output = self.sbctl(f"lvol connect {volume['volume_id']}") + connect_commands = [] + for line in connect_output.splitlines(): + stripped = line.strip() + if stripped.startswith("sudo nvme connect"): + connect_commands.append(stripped) + if not connect_commands: + raise TestRunError(f"No nvme connect command returned for {volume['volume_id']}") + nqn = None + for cmd in connect_commands: + m = NQN_RE.search(cmd) + if m: + nqn = m.group(1) + break + if nqn is None: + raise TestRunError( + f"Failed to parse NQN from lvol connect output for {volume['volume_id']}" + ) + volume["nqn"] = nqn + successful_connects = 0 + failed_connects = [] + for connect_cmd in connect_commands: + try: + self.client.run(connect_cmd, timeout=120, label=f"connect {volume['volume_id']}") + successful_connects += 1 + except TestRunError as exc: + failed_connects.append(str(exc)) + self.logger.log(f"Path connect failed for {volume['volume_id']}: {exc}") + if successful_connects == 0: + raise TestRunError( + f"No nvme paths connected for {volume['volume_id']}: {'; '.join(failed_connects)}" + ) + if failed_connects: + self.logger.log( + f"Continuing with {successful_connects}/{len(connect_commands)} connected paths " + f"for {volume['volume_id']}" + ) + volume["mount_point"] = posixpath.join(mount_root, f"vol{volume['index']}") + volume["fio_log"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.log") + volume["fio_stderr"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.stderr") + volume["rc_file"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.rc") + find_and_mount = ( + "set -euo pipefail\n" + f"dev=$(readlink -f /dev/disk/by-id/*{volume['volume_id']}* | head -n 1)\n" + "if [ -z \"$dev\" ]; then\n" + f" echo 'Failed to locate NVMe device for {volume['volume_id']}' >&2\n" + " exit 1\n" + "fi\n" + f"sudo mkfs.xfs -f \"$dev\"\n" + f"sudo mkdir -p {shlex.quote(volume['mount_point'])}\n" + f"sudo mount \"$dev\" {shlex.quote(volume['mount_point'])}\n" + f"sudo chown {shlex.quote(self.user)}:{shlex.quote(self.user)} {shlex.quote(volume['mount_point'])}\n" + ) + self.client.run( + f"bash -lc {shlex.quote(find_and_mount)}", + timeout=600, + label=f"format and mount {volume['volume_id']}", + ) + + def _build_fio_name(self, index, churn_id): + # Names embed both the volume index and the churn counter so the name + # is unique even after a churn replaces a volume — avoids prefix + # collisions when pkill -f matches by --name=. + return f"aws_dual_soak_v{index}_c{churn_id}" + + def _start_fio_for_volume(self, volume, fio_name): + # Capture fio's stdout+stderr to a dedicated file. --output only + # writes the aggregate summary on exit; progress lines and error + # messages ("fio: max_latency exceeded", IO error details, etc.) + # go to stderr during the run. That stream is the authoritative + # source for "what went wrong" — surface it on every fault. 
+        start_script = (
+            "set -euo pipefail\n"
+            f"rm -f {shlex.quote(volume['rc_file'])} {shlex.quote(volume['fio_stderr'])}\n"
+            "nohup bash -lc "
+            + shlex.quote(
+                f"cd {shlex.quote(volume['mount_point'])} && "
+                f"fio --name={fio_name} --directory={shlex.quote(volume['mount_point'])} "
+                "--direct=1 --rw=randrw --bs=4K --group_reporting --time_based "
+                f"--numjobs={FIO_NUMJOBS} --iodepth=4 --size=4G --runtime={self.args.runtime} "
+                "--ioengine=libaio --max_latency=20s --exitall_on_error=1 "
+                f"--output={shlex.quote(volume['fio_log'])}; "
+                "rc=$?; "
+                f"echo $rc > {shlex.quote(volume['rc_file'])}"
+            )
+            + f" > {shlex.quote(volume['fio_stderr'])} 2>&1 & echo $!"
+        )
+        _, stdout_text, _ = self.client.run(
+            f"bash -lc {shlex.quote(start_script)}",
+            timeout=60,
+            label=f"start fio {volume['volume_id']}",
+        )
+        pid_text = stdout_text.strip().splitlines()[-1]
+        pid = int(pid_text)
+        job = FioJob(
+            volume_id=volume["volume_id"],
+            volume_name=volume["volume_name"],
+            mount_point=volume["mount_point"],
+            fio_log=volume["fio_log"],
+            fio_stderr=volume["fio_stderr"],
+            rc_file=volume["rc_file"],
+            pid=pid,
+            fio_name=fio_name,
+        )
+        self.logger.log(f"Started fio for {volume['volume_name']} with pid {pid} (name={fio_name})")
+        return job
+
+    def start_fio(self, volumes):
+        self.logger.log("Starting fio on all mounted volumes in parallel")
+        fio_jobs = []
+        for volume in volumes:
+            # Embed the current churn counter so fio names stay unique
+            # across volume rebuilds (see the churn_counter comment in
+            # __init__); the pkill prefix aws_dual_soak_ still matches.
+            fio_name = self._build_fio_name(volume["index"], self.churn_counter)
+            fio_jobs.append(self._start_fio_for_volume(volume, fio_name))
+        self.fio_jobs = fio_jobs
+        # Give fio a few seconds to begin; don't block on worker fork — the
+        # authoritative "fio is in trouble" signal is rc_file / stderr, not
+        # process counts. If fio never issues IO the pre-stop check and
+        # outage cluster-health checks will surface that.
+        time.sleep(5)
+
+    def read_remote_file(self, path):
+        rc, stdout_text, _ = self.client.run(
+            f"bash -lc {shlex.quote(f'cat {shlex.quote(path)}')}",
+            timeout=30,
+            check=False,
+            label=f"read {path}",
+        )
+        if rc != 0:
+            return ""
+        return stdout_text
+
+    # ----- fio fault detection --------------------------------------------
+
+    # Any line in fio_stderr matching one of these (case-sensitive, fixed
+    # string) is treated as a fio fault — even if fio is still running.
+    # ``--max_latency`` violations in particular log "fio: latency of …
+    # exceeds specified max" and do NOT always terminate fio when run
+    # with --group_reporting + --numjobs>1, so a process-still-alive
+    # check alone misses them.
+    FIO_STDERR_ERROR_MARKERS = (
+        "fio: latency of",      # --max_latency violation
+        "fio: io_u error",      # io_u submission/completion error
+        "fio: pid=",            # generic fio per-job error dump
+        "io_u error on file",   # alternate io_u error format
+        "verify failed",        # data verification fault
+        "fio: verify",          # alternate verify error
+        "fio: error",           # generic fio error
+        "Killed",               # bash reports fio got SIGKILL
+        "Terminated",           # bash reports fio got SIGTERM (no churn here)
+    )
+
+    def _read_rc_file(self, job):
+        """Return the rc string if fio's wrapping bash wrote rc_file, else None.
+
+        ``rc_file`` is one of three independent fault signals; see
+        ``_check_fio_fault``.
+ """ + probe = ( + f"if [ -f {shlex.quote(job.rc_file)} ]; then " + f"cat {shlex.quote(job.rc_file)}; fi" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(probe)}", + timeout=15, + check=False, + label=f"check rc_file {job.volume_name}", + ) + rc = (stdout_text or "").strip() + return rc or None + + def _wrapper_alive(self, job): + """Return True iff the wrapping bash that runs fio is still alive. + + ``job.pid`` is the pid printed by ``echo $!`` at start_fio time + (the nohup'd bash, parent of fio). If that pid is gone AND no + rc_file was written, fio was signalled away and bash never got + to record an exit code — that case is a fault, not "still running". + """ + probe = ( + f"if kill -0 {int(job.pid)} 2>/dev/null; then echo alive; fi" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(probe)}", + timeout=15, + check=False, + label=f"check wrapper pid {job.volume_name}", + ) + return stdout_text.strip() == "alive" + + def _scan_fio_stderr_for_errors(self, job): + """Return matching error lines from fio_stderr (up to 20), or "". + + See FIO_STDERR_ERROR_MARKERS for the list. ``--max_latency`` + violations in particular are reported here even while fio + continues running, so this catches faults the rc_file / pid + checks would miss. + """ + if not self.FIO_STDERR_ERROR_MARKERS: + return "" + grep_args = " ".join( + f"-e {shlex.quote(p)}" for p in self.FIO_STDERR_ERROR_MARKERS + ) + grep_cmd = ( + f"grep -F -m 20 {grep_args} " + f"{shlex.quote(job.fio_stderr)} 2>/dev/null || true" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(grep_cmd)}", + timeout=15, + check=False, + label=f"scan stderr {job.volume_name}", + ) + return stdout_text.strip() + + def _check_fio_fault(self, job): + """Detect any fio fault for ``job``. Returns ``(kind, detail)`` or None. + + Three independent signals — ANY one is a fault: + * ``exited``: fio's wrapping bash wrote rc_file (any rc, including 0, + is a fault mid-run because fio's --runtime is orders of magnitude + longer than an outage iteration). + * ``missing``: the wrapping bash pid is gone and no rc_file was + written — fio was signalled away (or its wrapper died) without + recording an exit code. + * ``stderr_error``: fio_stderr contains a known fio error marker + (max_latency violation, io_u error, verify failure, etc.) — fio + may still be running but is degraded; treat it as a fault. + + ``detail`` is a human-readable one-liner. The full stderr/output + is dumped via ``_dump_fio_streams`` by the callers. 
+ """ + rc = self._read_rc_file(job) + if rc is not None: + return ("exited", f"fio exited rc={rc}") + + if not self._wrapper_alive(job): + return ( + "missing", + f"fio wrapper pid {job.pid} is gone and no rc_file was written", + ) + + err = self._scan_fio_stderr_for_errors(job) + if err: + first_line = err.splitlines()[0][:240] + return ("stderr_error", f"stderr error marker: {first_line}") + + return None + + def _dump_fio_streams(self, job, context): + """Write fio's captured stderr and --output summary into the soak + log so the actual fio error text (max_latency violations, IO + errors, "fio: pid=…, err=…, func=…" lines) is visible next to + the outage scenario that triggered it.""" + for label, path, lines in [ + ("fio stderr", job.fio_stderr, 200), + ("fio summary", job.fio_log, 60), + ]: + _, body, _ = self.client.run( + f"bash -lc {shlex.quote(f'tail -{lines} {shlex.quote(path)} 2>/dev/null || true')}", + timeout=30, + check=False, + label=f"dump {label} {job.volume_name}", + ) + if body.strip(): + self.logger.block( + f"[{context}] {job.volume_name} {label} ({path}):", + body, + ) + else: + self.logger.log( + f"[{context}] {job.volume_name} {label} ({path}): (empty)" + ) + + def check_fio(self): + """Raise if any tracked fio shows a fault. + + Three independent signals are evaluated per ``_check_fio_fault``: + rc_file written (fio exited), wrapper pid gone with no rc_file + (signalled away), or a fio error marker in stderr (max_latency + violation, io_u/verify error, etc.). ANY of these is a fault — + fio's ``--runtime`` is orders of magnitude longer than a single + outage iteration, and a degraded-but-running fio is just as + invalid a result as a dead one. + + On fault, every faulting job's captured stderr and --output + summary are dumped into the soak log so the exact fio error + lines are visible next to the iteration that triggered them. + """ + faulted = [] + for job in self.fio_jobs: + fault = self._check_fio_fault(job) + if fault is not None: + faulted.append((job, fault)) + if not faulted: + return + for job, (kind, detail) in faulted: + self._dump_fio_streams(job, context=f"fio fault [{kind}] {detail}") + details = ", ".join( + f"{j.volume_name}={kind}:{detail}" for j, (kind, detail) in faulted + ) + raise TestRunError(f"fio fault detected: {details}") + + def ensure_fio_running(self): + self.check_fio() + + def stop_fio(self): + """Stop every fio process launched by this soak on the client host. + + Called between outage iterations so rebalancing runs unloaded. + Before killing, calls ``check_fio`` — any fio that wrote its + rc_file is a mid-run exit, which is a fault. Dumps the captured + fio stderr/summary into the soak log so the actual fio error + text is side-by-side with the outage scenario that triggered it. + After the check passes we SIGTERM (short grace window) then + SIGKILL; matching by ``fio --name=aws_dual_soak_*`` catches both + the bash wrapper and any fio workers. + """ + if not self.fio_jobs: + return + + # Pre-kill verification: any fio having exited is a fault. + self.check_fio() + + self.logger.log("All fio still running; stopping them between iterations") + kill_script = ( + "set +e\n" + "sudo pkill -TERM -f 'fio --name=aws_dual_soak_' 2>/dev/null || true\n" + "for i in $(seq 1 15); do\n" + " if ! 
pgrep -f 'fio --name=aws_dual_soak_' >/dev/null; then\n" + " exit 0\n" + " fi\n" + " sleep 2\n" + "done\n" + "sudo pkill -KILL -f 'fio --name=aws_dual_soak_' 2>/dev/null || true\n" + ) + self.client.run( + f"bash -lc {shlex.quote(kill_script)}", + timeout=90, + check=False, + label="stop fio", + ) + # Drop the job list; start_fio will rebuild it from self.volumes. + self.fio_jobs = [] + + # ----- single-volume fio + churn --------------------------------------- + + def _disconnect_one_volume(self, volume): + nqn = volume.get("nqn") + if not nqn: + self.logger.log( + f"WARNING: no NQN saved for {volume['volume_name']}; skipping nvme disconnect" + ) + return + # ``nvme disconnect -n `` tears down every controller (path) for + # that subsystem in one call, so multipath connections are handled + # without a per-path teardown loop. + self.client.run( + f"sudo nvme disconnect -n {shlex.quote(nqn)}", + timeout=60, + check=False, + label=f"nvme disconnect {volume['volume_name']}", + ) + + def _unmount_one_volume(self, volume): + mount_point = volume.get("mount_point") + if not mount_point: + return + # Try plain unmount first, then -f, then lazy as last resort. We've + # already SIGKILLed the fio holding the mount, so plain umount + # should succeed on the happy path; the fallbacks only matter if + # buffered IO is still draining. + umount_script = ( + f"sudo umount {shlex.quote(mount_point)} 2>/dev/null || " + f"sudo umount -f {shlex.quote(mount_point)} 2>/dev/null || " + f"sudo umount -l {shlex.quote(mount_point)} 2>/dev/null || true" + ) + self.client.run( + f"bash -lc {shlex.quote(umount_script)}", + timeout=60, + check=False, + label=f"umount {volume['volume_name']}", + ) + + def _delete_one_lvol(self, volume): + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"lvol delete {volume['volume_id']} --force", + timeout=600, + ) + if rc != 0: + raise TestRunError( + f"lvol delete failed for {volume['volume_name']} ({volume['volume_id']}): " + f"{stdout_text.strip()} | {stderr_text.strip()}" + ) + if volume["volume_id"] in self.created_volume_ids: + self.created_volume_ids.remove(volume["volume_id"]) + + def _churn_one_volume(self): + """Rebuild one randomly-selected volume. + + Called between iterations, AFTER stop_fio. fio is not running, so + no per-job teardown is needed — we just delete + recreate + + remount the volume. The next iteration's start_fio() will pick up + the fresh volume and start a new fio job for it. + """ + if not self.volumes: + return + idx = random.randrange(len(self.volumes)) + old_volume = self.volumes[idx] + + self.churn_counter += 1 + churn_id = self.churn_counter + new_name = f"aws_dual_soak_{self.run_id}_v{old_volume['index']}_c{churn_id}" + self.logger.log( + f"churn {churn_id}: rebuilding {old_volume['volume_name']} " + f"({old_volume['volume_id']}) on node {old_volume['node_uuid']} " + f"-> {new_name}" + ) + + self._unmount_one_volume(old_volume) + self._disconnect_one_volume(old_volume) + self._delete_one_lvol(old_volume) + + # Recreate on the SAME storage node so the topology used by the + # outage scenario list (pinned at startup off role-representative + # pairs) doesn't drift. 
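+        # The rebuilt volume reuses the old index, so mount_point and the
+        # fio_vol<index>.* paths stay stable; cross-churn uniqueness comes
+        # from the _c<churn_id> suffix in the volume name.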
+ new_volume = self._create_one_volume( + new_name, + old_volume["node_uuid"], + old_volume["index"], + ) + self._connect_and_mount_one(new_volume, self.mount_root) + + self.volumes[idx] = new_volume + self.logger.log( + f"churn {churn_id}: complete; {new_name} ({new_volume['volume_id']}) " + f"will be picked up by next start_fio" + ) + + # ----- outage methods --------------------------------------------------- + + def _forced_shutdown(self, node_id): + """Shutdown with --force; still retry if blocked by migration.""" + while True: + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"sn shutdown {node_id} --force", + timeout=300, + ) + if rc == 0: + return + output = f"{stdout_text}\n{stderr_text}".lower() + retry_markers = ( + "migration", "migrat", "rebalanc", + "active task", "running task", + "in_progress", "in progress", + ) + if any(m in output for m in retry_markers): + self.logger.log( + f"Forced shutdown of {node_id} blocked by migration/task; retrying in 15s" + ) + time.sleep(15) + continue + raise RemoteCommandError( + f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id} --force" + ) + + def _container_kill(self, node_id): + """Kill the SPDK container on the storage node's host. Node is expected + to auto-recover; no sbctl restart is issued.""" + host = self._node_host(node_id) + cmd = ( + "set -euo pipefail; " + "cns=$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true); " + "if [ -z \"$cns\" ]; then echo 'no spdk_* container found' >&2; exit 0; fi; " + "for cn in $cns; do echo \"killing $cn\"; sudo docker kill \"$cn\" || true; done" + ) + host.run( + f"bash -lc {shlex.quote(cmd)}", + timeout=120, + check=False, + label=f"container_kill {node_id}", + ) + + def _host_reboot(self, node_id): + """Reboot the storage node's host. Node is expected to auto-recover; + no sbctl restart is issued.""" + host = self._node_host(node_id) + # nohup + background + sleep so the shell exit beats reboot cleanly + cmd = "sudo nohup bash -c 'sleep 2; reboot -f' >/dev/null 2>&1 &" + try: + host.run( + f"bash -lc {shlex.quote(cmd)}", + timeout=30, + check=False, + label=f"host_reboot {node_id}", + ) + except RemoteCommandError as exc: + # SSH may drop as the host goes down — not fatal. + self.logger.log(f"host_reboot {node_id}: ssh terminated as expected: {exc}") + # Drop the cached SSH client; it's going to die anyway. + cached = self.node_hosts.pop(node_id, None) + if cached is not None: + try: + cached.close() + except Exception: + pass + + # --- Multipath NIC chaos --- + + def _is_multipath(self): + return bool(self.metadata.get("multipath")) + + def _get_data_nics(self): + """Return the list of data NIC names (e.g. ['eth1', 'eth2']).""" + nics = self.metadata.get("data_nics") + if nics: + return nics + iface = self.metadata.get("data_iface") + if iface: + return [iface] + return [] + + def _network_outage(self, node_id, duration): + """Take all data NICs down on one storage node for *duration* seconds, + then bring them back up. Simulates a transient network partition of + a single node. 
Node is expected to auto-recover once the NICs return + — no sbctl restart is issued.""" + host = self._node_host(node_id) + nics = self._get_data_nics() or ["eth1"] + self.logger.log( + f"network_outage on {node_id}: dropping {nics} for {duration}s" + ) + for nic in nics: + try: + host.run(f"sudo ip link set {nic} down", timeout=10, check=False, + label=f"netout down {nic} on {node_id}") + except Exception as e: + self.logger.log(f"WARNING: failed to down {nic} on {node_id}: {e}") + try: + time.sleep(duration) + finally: + for nic in nics: + try: + host.run(f"sudo ip link set {nic} up", timeout=10, check=False, + label=f"netout up {nic} on {node_id}") + except Exception as e: + self.logger.log(f"WARNING: failed to up {nic} on {node_id}: {e}") + + def _apply_outage(self, node_id, method): + self.logger.log(f"Applying outage '{method}' on {node_id}") + if method == "graceful": + self.shutdown_with_migration_retry(node_id) + elif method == "forced": + self._forced_shutdown(node_id) + elif method == "container_kill": + self._container_kill(node_id) + elif method == "host_reboot": + self._host_reboot(node_id) + elif method.startswith("network_outage_"): + try: + duration = int(method.rsplit("_", 1)[-1]) + except ValueError: + raise TestRunError(f"Unknown outage method: {method}") + self._network_outage(node_id, duration) + else: + raise TestRunError(f"Unknown outage method: {method}") + + def _needs_manual_restart(self, method): + return method not in AUTO_RECOVER_METHODS + + def wait_node_leaves_online(self, node_id, timeout=90, poll=2): + """Poll sbctl until the control plane observes node_id leaving 'online'. + Returns True once any non-online status is seen, False on timeout. + + Why this exists: the CP's health-check loop updates status on its own + cadence. If the soak polls wait_for_all_online *before* the CP has + noticed the outage, the first poll reports all-online and we return + while the target is actually still down. The next iteration then + stacks extra outages on a silently-offline node and breaks the FTT + budget (see incident: 2026-04-20 iter 17 container_kill on 2870dfa5, + CP status transition lagged the soak's first sn-list by ~1 s). + """ + deadline = time.time() + timeout + while time.time() < deadline: + try: + nodes = self.get_nodes() + except Exception as exc: + self.logger.log(f"wait_node_leaves_online: sn list failed ({exc})") + time.sleep(poll) + continue + status = next( + (n["status"] for n in nodes if n["uuid"] == node_id), + "unknown", + ) + if status != "online": + self.logger.log( + f"CP observed {node_id[:8]} leaving online (now {status})" + ) + return True + time.sleep(poll) + return False + + def run_outage_pair(self, node1, node2, method1, method2): + self.logger.log( + f"Outage pair: {node1}={method1} and {node2}={method2}" + ) + # Apply first outage, then optional gap, then second outage. + self._apply_outage(node1, method1) + if self.args.shutdown_gap: + time.sleep(self.args.shutdown_gap) + self._apply_outage(node2, method2) + + # Issue sbctl restart only for methods that leave the node in a + # "shutdown" state that the CP won't recover on its own. + # Retry with backoff: when the other node in the pair used an + # auto-recover method (container_kill / host_reboot), it may + # still be in_shutdown or in_restart when we try to restart the + # manually-recovered peer — the per-cluster guard rejects + # concurrent restarts. Retrying gives the auto-recovering node + # time to come back. 
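+        # The loop below re-issues `sn restart` every 15s (fixed interval)
+        # until it succeeds or --restart-timeout elapses; the last failure
+        # is re-raised once the deadline passes.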
+ for node_id, method in [(node1, method1), (node2, method2)]: + if not self._needs_manual_restart(method): + continue + deadline = time.time() + self.args.restart_timeout + while True: + try: + # Emit a RESTART header with the wall-clock timestamp, + # then dump the raw sbctl -d restart stdout below it + # (without per-line timestamp prefix) so the CP trace + # produced by -d lines up with a single moment in time. + self.logger.log( + f"RESTART: {time.strftime('%Y-%m-%d %H:%M:%S')} {node_id}" + ) + stdout_text = self.sbctl(f"sn restart {node_id}", timeout=300) + with self.logger.lock: + print(stdout_text, flush=True, end="" + if stdout_text.endswith("\n") else "\n") + with open(self.logger.path, "a", encoding="utf-8") as handle: + handle.write(stdout_text) + if not stdout_text.endswith("\n"): + handle.write("\n") + break + except Exception as e: + if time.time() >= deadline: + raise + self.logger.log( + f"Restart of {node_id} failed ({e}), " + f"retrying in 15s (peer may still be recovering)") + time.sleep(15) + + # Before we call wait_for_all_online, make sure the control plane has + # actually observed each auto-recover target leaving 'online' state. + # Otherwise wait_for_all_online can race the CP: the first sn-list + # poll may still report the just-killed node as 'online' (stale), + # all statuses look good, and we return immediately — the node is + # then in a silent offline state when the next iteration stacks + # more outages on top, crossing the FTT budget. + # network_outage_* methods can finish before the CP notices; that's + # fine (short outages often recover from HA multipath without CP + # involvement), so we don't fail if the observation window expires. + for node_id, method in [(node1, method1), (node2, method2)]: + if method not in AUTO_RECOVER_METHODS: + continue + if method.startswith("network_outage_"): + observed = self.wait_node_leaves_online(node_id, timeout=30) + if not observed: + self.logger.log( + f"CP did not observe {node_id[:8]} offline for " + f"{method} within 30s (expected for short NIC drops)" + ) + else: + # container_kill, host_reboot: the node IS down; we must see it. + observed = self.wait_node_leaves_online(node_id, timeout=90) + if not observed: + self.logger.log( + f"WARN: CP never observed {node_id[:8]} offline after " + f"{method} within 90s; sn-list may be stale" + ) + + # For auto-recovery methods, allow a longer wait window since the host + # has to reboot / the container has to come back under its supervisor. + wait_timeout = self.args.restart_timeout + if any( + m in AUTO_RECOVER_METHODS for m in (method1, method2) + ): + wait_timeout = max(wait_timeout, self.args.auto_recover_wait) + + self.wait_for_all_online( + target_nodes={node1, node2}, timeout=wait_timeout + ) + # Intentionally no check_fio / wait_for_cluster_stable here: the + # outer loop calls check_fio right after this returns, then + # stop_fio, then waits for cluster stability unloaded. + + # ----- topology & scenario enumeration --------------------------------- + + def discover_topology(self): + """Return {lvs_name: {'primary': uuid, 'secondary': uuid, 'tertiary': uuid}}. + + Queried once at soak startup to identify the 4 role-representative + node pairs. Leader takeover mid-soak may shift role assignments; + the scenario list is pinned at startup so the 4 chosen pairs stay + fixed across retries even if the CP has re-promoted since. 
+ """ + script = ( + "import json; " + "from simplyblock_core import db_controller; " + "db = db_controller.DBController(); " + "nodes = db.get_storage_nodes(); " + "out = {n.lvstore: {" + "'primary': n.get_id(), " + "'secondary': getattr(n, 'secondary_node_id', '') or '', " + "'tertiary': getattr(n, 'tertiary_node_id', '') or ''" + "} for n in nodes " + "if getattr(n, 'lvstore', '') " + "and not getattr(n, 'is_secondary_node', False)}; " + "print(json.dumps(out))" + ) + _, stdout_text, _ = self.mgmt.run( + f"sudo python3 -c {shlex.quote(script)}", + timeout=60, + label="discover topology", + ) + for line in reversed((stdout_text or "").strip().splitlines()): + line = line.strip() + if line.startswith("{"): + try: + return json.loads(line) + except json.JSONDecodeError: + continue + raise TestRunError( + f"Failed to parse topology JSON from mgmt; stdout was:\n{stdout_text}" + ) + + def _validate_topology_for_categories(self): + """Verify the pinned topology can supply at least one pair per category. + + Raises TestRunError if: + * the pinned topology has no LVS (empty cluster) + * no LVS has both primary and secondary (primary_secondary unservable) + * no LVS has both primary and tertiary (primary_tertiary unservable) + * no unrelated pair exists — in a dense FT=2 ring with N ≤ 4 this + is possible; raise so the coverage gap is explicit. + """ + if not self.topology: + raise TestRunError("Empty topology; cannot pick representative pairs") + + if not self._candidate_pairs_for_role("secondary"): + raise TestRunError( + "No LVS in topology has both primary and secondary; " + "primary_secondary category is unservable" + ) + if not self._candidate_pairs_for_role("tertiary"): + raise TestRunError( + "No LVS in topology has both primary and tertiary; " + "primary_tertiary category is unservable" + ) + + all_nodes, lvs_members = self._lvs_membership() + if not self._unrelated_pairs(all_nodes, lvs_members): + raise TestRunError( + "No unrelated node pair found in topology " + f"({len(all_nodes)} nodes across {len(lvs_members)} LVSs)" + ) + + def _candidate_pairs_for_role(self, role_b): + """All (primary, role_b) pairs across the pinned topology.""" + pairs = [] + for roles in self.topology.values(): + a = roles.get("primary") + b = roles.get(role_b) + if a and b: + pairs.append((a, b)) + return pairs + + def _lvs_membership(self): + """Return (all_nodes, lvs_members) derived from the pinned topology.""" + all_nodes = set() + lvs_members = [] + for r in self.topology.values(): + members = {v for v in r.values() if v} + lvs_members.append(members) + all_nodes.update(members) + return all_nodes, lvs_members + + def _unrelated_pairs(self, all_nodes, lvs_members): + """All node pairs that share no LVS in any role.""" + pairs = [] + for a, b in itertools.combinations(sorted(all_nodes), 2): + if not any(a in m and b in m for m in lvs_members): + pairs.append((a, b)) + return pairs + + def pick_pair_for_category(self, category): + """Randomly pick a (node_a, node_b) pair for the given role category. 
+
+        Distance preserved per category (so each scenario in a "group" hits
+        the same topological relationship, just on different concrete nodes):
+          - primary_secondary: ring-distance 1 (direct successor)
+          - primary_tertiary:  ring-distance 2 (exactly one node between)
+          - unrelated:         ring-distance ≥ 3 (≥ 2 nodes between)
+        """
+        if category == "unrelated":
+            all_nodes, lvs_members = self._lvs_membership()
+            candidates = self._unrelated_pairs(all_nodes, lvs_members)
+        elif category == "primary_secondary":
+            candidates = self._candidate_pairs_for_role("secondary")
+        elif category == "primary_tertiary":
+            candidates = self._candidate_pairs_for_role("tertiary")
+        else:
+            raise TestRunError(f"Unknown role category: {category}")
+
+        if not candidates:
+            raise TestRunError(
+                f"No candidate pairs available for category {category}"
+            )
+        return random.choice(candidates)
+
+    def build_scenarios(self, nodes):
+        """Enumerate role categories × P(M,2) ordered method pairs.
+
+        Returns a list of dicts with keys: method_a, method_b, category.
+        The actual (a, b) node pair is rolled at iteration time via
+        ``pick_pair_for_category`` so the soak hits many concrete pairs per
+        group while keeping the relative ring-distance fixed per category.
+        Same-method pairs are NOT included — ordered distinct pairs
+        only, per itertools.permutations(methods, 2).
+        """
+        _ = nodes  # unused: pair picking happens at iteration time
+        scenarios = []
+        for category in ROLE_CATEGORIES:
+            for m_a, m_b in itertools.permutations(self.methods, 2):
+                scenarios.append({
+                    "method_a": m_a,
+                    "method_b": m_b,
+                    "category": category,
+                })
+        method_pair_count = len(self.methods) * (len(self.methods) - 1)
+        self.logger.log(
+            f"Built {len(scenarios)} scenarios: "
+            f"{len(ROLE_CATEGORIES)} role categories × "
+            f"P({len(self.methods)},2)={method_pair_count} ordered method pairs "
+            f"(node pair rolled randomly per scenario)"
+        )
+        return scenarios
+
+    def run(self):
+        self.ensure_prerequisites()
+        nodes = self.ensure_expected_nodes()
+        self.wait_for_all_online(timeout=self.args.restart_timeout)
+        # Wait for the cluster to be fully stable (no in-flight rebalance
+        # or data migration) before starting iterations.
+        self.wait_for_cluster_stable()
+        self.wait_for_data_migration_complete("test start")
+        mount_root = self.prepare_client()
+        # Saved so the churn cycle can mount its newly-created volume back
+        # into the same workspace tree.
+        self.mount_root = mount_root
+        volumes = self.create_volumes(nodes)
+        # Stored so the churn cycle can drive per-volume teardown/rebuild
+        # without re-creating / re-mounting the underlying soak workspace.
+        self.volumes = volumes
+        self.connect_and_mount_volumes(volumes, mount_root)
+
+        # Pin the topology once, before any outages. Leader takeover during
+        # the soak can permanently shift role assignments, but the candidate
+        # pair pools are fixed at startup, so every cycle draws its pairs
+        # for each role category from the same topology snapshot.
+ self.topology = self.discover_topology() + self.logger.log(f"Pinned topology: {json.dumps(self.topology, sort_keys=True)}") + self._validate_topology_for_categories() + self.scenarios = self.build_scenarios(nodes) + if not self.scenarios: + raise TestRunError("No outage scenarios built; method/node list empty") + + start_at = max(1, self.args.start_at) + if start_at > len(self.scenarios): + raise TestRunError( + f"--start-at {start_at} exceeds scenario count " + f"{len(self.scenarios)}; nothing to run" + ) + + churn_every = 0 if self.args.no_churn else self.args.churn_every_n_iters + if churn_every > 0: + self.logger.log( + f"Volume churn enabled: rebuild one random volume every " + f"{churn_every} iteration(s) in the unloaded settle window" + ) + else: + self.logger.log("Volume churn disabled") + + # iteration counter is aligned to scenario_idx: when --start-at N is + # used, the first executed scenario logs as iteration=N so post-hoc + # grep for "iteration 60" finds the resumed scenario and its prior + # failure side by side. + iteration = start_at - 1 + cycle = 0 + while True: + cycle += 1 + if self.args.cycles and cycle > self.args.cycles: + self.logger.log( + f"Completed {cycle - 1} full cycle(s) of {len(self.scenarios)} " + f"scenarios; exiting" + ) + return + + cycle_scenarios = list(self.scenarios) + if self.args.shuffle_scenarios: + # Seed off cycle number so two soaks with the same --cycles + # walk identical sequences, but successive cycles rotate + # through different orderings. + random.Random(cycle).shuffle(cycle_scenarios) + + cycle_start_at = start_at if cycle == 1 else 1 + self.logger.log( + f"Starting cycle {cycle} ({len(cycle_scenarios)} scenarios" + f"{', shuffled' if self.args.shuffle_scenarios else ''}" + f"{f', starting at scenario {cycle_start_at}' if cycle_start_at > 1 else ''})" + ) + + for scenario_idx, scenario in enumerate(cycle_scenarios, 1): + if scenario_idx < cycle_start_at: + continue + iteration += 1 + + node1, node2 = self.pick_pair_for_category(scenario["category"]) + method1 = scenario["method_a"] + method2 = scenario["method_b"] + + self.logger.log( + f"Starting outage iteration {iteration} " + f"(cycle {cycle} scenario {scenario_idx}/{len(cycle_scenarios)}): " + f"category={scenario['category']} " + f"pair=({node1[:8]},{node2[:8]}) " + f"methods=({method1},{method2})" + ) + + # Skip scenarios whose nodes are not currently in the + # expected-node set (e.g. one has been removed from the + # cluster mid-soak). Better to log-and-skip than to try to + # restart a ghost. + current_uuids = {n["uuid"] for n in self.ensure_expected_nodes()} + missing = [uid for uid in (node1, node2) if uid not in current_uuids] + if missing: + self.logger.log( + f"Scenario {iteration} skipped: nodes {missing} not in " + f"current cluster set {sorted(current_uuids)}" + ) + continue + + # Load during outage: start fresh fio so the outage hits + # live IO. This is the only window where fio runs. + self.start_fio(self.volumes) + + self.run_outage_pair(node1, node2, method1, method2) + + # Fault gate: any fio that exited / faulted during the + # outage is a real failure. check_fio raises on faults. + self.check_fio() + + # Unload before settle: stop fio so the rebalance / data- + # migration drain runs without IO pressure. This is what + # makes the iteration cycle short. + self.stop_fio() + + # Optional volume rebuild. fio is already stopped, so we + # don't need any per-job teardown — just delete + recreate + # + remount; the next iteration's start_fio picks it up. 
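+                # Cadence sketch (flag value hypothetical): with
+                # --churn-every-n-iters 3, iterations 3, 6, 9, ... each
+                # rebuild one volume; `iteration` keeps counting across
+                # cycles, so the cadence is global, not per-cycle.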
+ if churn_every > 0 and iteration % churn_every == 0: + self._churn_one_volume() + + self.wait_for_all_online(timeout=self.args.restart_timeout) + self.wait_for_cluster_stable() + self.wait_for_data_migration_complete("next iteration") + + +def main(): + args = parse_args() + logger = Logger(args.log_file) + logger.log(f"Logging to {args.log_file}") + metadata = load_metadata(args.metadata) + if not metadata.get("clients"): + raise SystemExit("Metadata file does not contain a client host") + + runner = SoakRunner(args, metadata, logger) + try: + runner.run() + except (RemoteCommandError, TestRunError, ValueError) as exc: + logger.log(f"ERROR: {exc}") + sys.exit(1) + finally: + runner.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/setup_lab_perf_test1.py b/scripts/setup_lab_perf_test1.py index 305d836fd..ca44ce49f 100644 --- a/scripts/setup_lab_perf_test1.py +++ b/scripts/setup_lab_perf_test1.py @@ -41,8 +41,8 @@ USER = "root" IFACE = "eth0" DATA_IFACE = "eth1" -BRANCH = "inline-checksum-validation" -MAX_LVOL = "100" +BRANCH = "main" +MAX_LVOL = "25" # Same volume plan layout as the AWS variant; consumed by downstream perf tooling. VOLUME_PLAN = [ @@ -280,6 +280,78 @@ def normalize_ref(value): return json.loads(output) +def fetch_alceml_modes(mgmt_ip, cluster_uuid): + """Return per-alceml mode info for every storage device in the cluster. + + Mirrors simplyblock_core.utils.alceml_checksum_params: + 0 = off (cluster.inline_checksum False) + 1 = md-on-device (cluster ON, device md_supported) + 2 = fallback / emulation (cluster ON, device has no md-capable LBAF) + """ + script = f"""python3 - <<'PY' +import json +from simplyblock_core.db_controller import DBController + +db = DBController() +cluster = db.get_cluster_by_id({cluster_uuid!r}) +nodes = db.get_storage_nodes_by_cluster_id({cluster_uuid!r}) or [] +inline = bool(getattr(cluster, "inline_checksum", False)) + +rows = [] +for node in nodes: + label = getattr(node, "hostname", "") or node.get_id() + for dev in (getattr(node, "nvme_devices", None) or []): + md_supported = bool(getattr(dev, "md_supported", False)) + md_size = int(getattr(dev, "md_size", 0) or 0) + if not inline: + method, mode_label = 0, "off" + elif md_supported: + method, mode_label = 1, "md-on-device" + else: + method, mode_label = 2, "fallback (emulation)" + rows.append({{ + "node": label, + "alceml": getattr(dev, "alceml_name", "") or getattr(dev, "uuid", ""), + "method": method, + "mode": mode_label, + "md_supported": md_supported, + "md_size": md_size, + }}) + +print(json.dumps({{"inline_checksum": inline, "devices": rows}}, indent=2)) +PY""" + output = ssh_exec(mgmt_ip, [script], get_output=True, check=True)[0] + return json.loads(output) + + +def print_alceml_summary(summary): + inline = summary.get("inline_checksum", False) + devices = summary.get("devices", []) + print("\n--- ALCEML inline-checksum modes ---") + print(f"Cluster inline_checksum: {'ENABLED' if inline else 'disabled'}") + if not devices: + print(" (no devices reported)") + return + by_node = {} + for row in devices: + by_node.setdefault(row["node"], []).append(row) + for node, rows in sorted(by_node.items()): + print(f" {node}:") + for row in rows: + print( + f" - {row['alceml'] or '(unnamed)':<40} " + f"method={row['method']} {row['mode']:<22} " + f"md_size={row['md_size']} md_supported={row['md_supported']}" + ) + md_count = sum(1 for r in devices if r["method"] == 1) + fb_count = sum(1 for r in devices if r["method"] == 2) + off_count = sum(1 for r in devices if 
r["method"] == 0) + print( + f"Totals: md-on-device={md_count} fallback={fb_count} off={off_count} " + f"(of {len(devices)} devices)" + ) + + def parse_args(): parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( @@ -296,6 +368,16 @@ def parse_args(): default="cluster_metadata_base.json", help="Where to write the cluster metadata JSON (default: ./cluster_metadata_base.json).", ) + parser.add_argument( + "--no-inline-checksum", + action="store_true", + help=( + "Disable inline CRC checksum validation. By default the cluster is " + "created with --enable-inline-checksum (matching the inline-checksum-" + "validation branch + ultra:checksum-validation-latest image). The " + "flag is frozen at create time and cannot be changed later." + ), + ) return parser.parse_args() @@ -348,19 +430,140 @@ def main(): t.result() print("Phase 1: DONE - all nodes have sbcli installed.") + # --- Phase 1.5: cleanup leftover state from any prior deploy --- + # Order matters: + # 1. sn deploy-cleaner first (tears down SPDK containers + NVMe state). + # 2. docker rm -f any stragglers, then `docker system prune -af --volumes`. + # Per the deployment notes: SAFE before cluster create (no active FDB + # volumes yet); NEVER run after activate (it would wipe FDB). + # 3. Fresh `docker pull` of the simplyblock + ultra images named in the + # installed env_var, so we don't reuse a stale cached layer. + print("Phase 1.5a: Running sbctl sn deploy-cleaner on every node...") + deploy_cleaner_cmds = ["/usr/local/bin/sbctl -d sn deploy-cleaner"] + with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor: + tasks = [executor.submit(ssh_exec, ip, deploy_cleaner_cmds, check=False) + for ip in all_setup_ips] + for t in tasks: + t.result() + print("Phase 1.5a: DONE.") + + print("Phase 1.5b: Removing any straggler containers and pruning Docker...") + docker_cleanup_cmds = [ + "containers=$(docker ps -aq); " + "if [ -n \"$containers\" ]; then docker rm -f $containers; fi", + "docker system prune -af --volumes", + ] + with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor: + tasks = [executor.submit(ssh_exec, ip, docker_cleanup_cmds, check=False) + for ip in all_setup_ips] + for t in tasks: + t.result() + print("Phase 1.5b: DONE.") + + # NVMe partition cleanup. deploy-cleaner already pulls SPDK off the + # drives, but a prior deploy may have left GPT tables / filesystem + # signatures / leftover namespace state behind. Wipe signatures, then + # nvme-format every non-root NVMe so the data plane sees a clean slate. + # sn configure --enable-inline-checksum --force will reformat to a + # metadata-capable LBAF on top of this. Storage nodes only -- the mgmt + # node is never used for SPDK data devices. 
+    print("Phase 1.5c: Wiping partitions and formatting NVMes on storage nodes...")
+    nvme_cleanup_script = r"""set -u
+root_src=$(findmnt -no SOURCE / 2>/dev/null || true)
+root_dev=$(echo "$root_src" | sed -E 's|p?[0-9]+$||')
+echo "Root NVMe (will be skipped): $root_dev"
+for d in $(lsblk -dno NAME,TYPE | awk '$2=="disk" && $1 ~ /^nvme/ {print "/dev/"$1}'); do
+  [ -b "$d" ] || continue
+  if [ "$d" = "$root_dev" ]; then
+    echo "Skip $d (root)"
+    continue
+  fi
+  for p in ${d}p*; do
+    [ -b "$p" ] || continue
+    umount -f "$p" 2>/dev/null || true
+  done
+  echo "Wiping $d (wipefs)"
+  wipefs -af "$d" 2>/dev/null || true
+  echo "Formatting $d (nvme format -s 0)"
+  nvme format "$d" -f -s 0 2>/dev/null || \
+    echo "  WARN: nvme format failed on $d (continuing; sn configure will retry)"
+done
+"""
+    with ThreadPoolExecutor(max_workers=len(sn_ips)) as executor:
+        tasks = [executor.submit(ssh_exec, ip, [nvme_cleanup_script], check=False)
+                 for ip in sn_ips]
+        for t in tasks:
+            t.result()
+    print("Phase 1.5c: DONE.")
+
+    print("Phase 1.5d: Fresh-pulling simplyblock + ultra images on every node...")
+    # Pull with retry: public.ecr.aws occasionally returns transient errors
+    # (IPv6 source-address races, S3 signed-URL hiccups, etc.). Retry up to
+    # 6 times with 15s backoff so one node's blip doesn't abort the deploy.
+    pull_script = """python3 - <<'PY'
+import os, subprocess, sys, time
+import simplyblock_core
+envf = os.path.join(os.path.dirname(simplyblock_core.__file__), 'env_var')
+images = []
+with open(envf) as f:
+    for line in f:
+        if '=' not in line:
+            continue
+        k, v = line.strip().split('=', 1)
+        if k in ('SIMPLY_BLOCK_DOCKER_IMAGE', 'SIMPLY_BLOCK_SPDK_ULTRA_IMAGE') and v:
+            images.append(v)
+if not images:
+    print('no images found in env_var', file=sys.stderr)
+    sys.exit(1)
+for img in images:
+    print(f'Pulling {img}', flush=True)
+    last_rc = 1
+    for attempt in range(1, 7):
+        last_rc = subprocess.call(['docker', 'pull', img])
+        if last_rc == 0:
+            break
+        print(f'  pull failed (rc={last_rc}), attempt {attempt}/6 - retry in 15s', flush=True)
+        time.sleep(15)
+    if last_rc != 0:
+        print(f'  giving up on {img} after 6 attempts', file=sys.stderr)
+        sys.exit(last_rc)
+PY"""
+    with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor:
+        tasks = [executor.submit(ssh_exec, ip, [pull_script], check=True)
+                 for ip in all_setup_ips]
+        for t in tasks:
+            t.result()
+    print("Phase 1.5d: DONE - all nodes have fresh images.")
+
+    inline_checksum = not args.no_inline_checksum
+    checksum_flag = " --enable-inline-checksum" if inline_checksum else ""
+    print(f"Inline checksum validation: {'ENABLED' if inline_checksum else 'disabled'}")
+
     # --- Phase 2: cluster create + sn configure/deploy ---
     print("Phase 2a: Creating cluster on management node...")
     ssh_exec(mgmt_ip, [
         "/usr/local/bin/sbctl -d cluster create --enable-node-affinity"
         " --data-chunks-per-stripe 2 --parity-chunks-per-stripe 2"
+        + checksum_flag
     ], check=True)
     print("Phase 2a: DONE - cluster created.")
 
+    # sn configure --force always prompts "Type YES/Y to continue" before
+    # formatting NVMes (see simplyblock_core/utils/__init__.py:~1789). The
+    # prompt is for interactive safety; here we feed YES on stdin so the
+    # automated deploy doesn't hang the full SSH timeout (10 min) on the
+    # confirmation. Wrap with `echo YES | ...` instead of plumbing stdin
+    # through ssh_exec because it's localized to this one command.
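+    # With this diff's defaults (MAX_LVOL = "25", inline checksum on) the
+    # assembled command comes out roughly as:
+    #   echo YES | /usr/local/bin/sbctl -d sn configure --max-lvol 25 \
+    #       --enable-inline-checksum --force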
print("Phase 2b: Configuring storage nodes...") + configure_cmd = ( + f"/usr/local/bin/sbctl -d sn configure --max-lvol {shlex.quote(args.max_lvol)}" + + checksum_flag + (" --force" if inline_checksum else "") + ) + if inline_checksum: + configure_cmd = f"echo YES | {configure_cmd}" with ThreadPoolExecutor(max_workers=len(sn_ips)) as executor: - tasks = [executor.submit(ssh_exec, ip, [ - f"/usr/local/bin/sbctl -d sn configure --max-lvol {shlex.quote(args.max_lvol)}" - ], check=True) for ip in sn_ips] + tasks = [executor.submit(ssh_exec, ip, [configure_cmd], check=True) + for ip in sn_ips] for t in tasks: t.result() print("Phase 2b: DONE - all SNs configured.") @@ -469,6 +672,12 @@ def main(): with open(args.metadata_out, "w") as f: json.dump(final_metadata, f, indent=4) + try: + alceml_summary = fetch_alceml_modes(mgmt_ip, cluster_uuid) + print_alceml_summary(alceml_summary) + except Exception as exc: + print(f"WARNING: failed to fetch ALCEML mode summary: {exc}") + print("\n--- Setup Complete ---") print(f"Cluster {cluster_uuid} is active. Metadata saved to {args.metadata_out}.") diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index d0b3ee71a..f00f0b662 100755 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -104,6 +104,7 @@ def init_storage_node__configure(self, subparser): argument = subcommand.add_argument('--size-range', help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together.', type=str, default='', dest='size_range', required=False) argument = subcommand.add_argument('--nvme-names', help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1.', type=str, default='', dest='nvme_names', required=False) argument = subcommand.add_argument('--force', help='Force format detected or passed nvme pci address to 4K and clean partitions.', dest='force', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='When formatting (with --force), prefer an LBAF that supports >=8 bytes of NVMe metadata per block, so alceml can run inline checksum validation in md-on-device mode. Drives with no md-capable LBAF still format to plain 4K and will use the fallback layout.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--calculate-hp-only', help='Calculate the minimum required huge pages, it depends on the following params: --cores-percentage, --sockets-to-use, --max-lvol, --nodes-per-socket, --number-of-devices.', dest='calculate_hp_only', action='store_true') argument = subcommand.add_argument('--number-of-devices', help='Number of devices that will be used on this host. For calculating huge pages memory only.', type=int, dest='number_of_devices') @@ -419,6 +420,7 @@ def init_cluster__create(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--disable-monitoring', help='Disable monitoring stack, false by default. Default: `false`.', dest='disable_monitoring', action='store_true') argument = subcommand.add_argument('--strict-node-anti-affinity', help='Enable strict node anti affinity for storage nodes. Never more than one chunk is placed on a node. 
This requires a minimum of _data-chunks-in-stripe + parity-chunks-in-stripe + 1_ nodes in the cluster.', dest='strict_node_anti_affinity', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='Enable inline CRC checksum validation on every IO for silent-data-error protection. Cannot be enabled or disabled after cluster creation. Per-device alceml mode (md-on-device vs fallback) is auto-detected at add-node.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--name', '-n', help='Assigns a name to the newly created cluster.', type=str, dest='name') argument = subcommand.add_argument('--qpair-count', help='The NVMe/TCP transport qpair count per logical volume. Default: `32`.', type=range_type(0, 128), default=32, dest='qpair_count') argument = subcommand.add_argument('--client-qpair-count', help='The default NVMe/TCP transport qpair count per logical volume for client. Default: `3`.', type=range_type(0, 128), default=3, dest='client_qpair_count') @@ -453,6 +455,7 @@ def init_cluster__add(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--inflight-io-threshold', help='The number of inflight IOs allowed before the IO queuing starts. Default: `4`.', type=int, default=4, dest='inflight_io_threshold') argument = subcommand.add_argument('--strict-node-anti-affinity', help='Enable strict node anti affinity for storage nodes. Never more than one chunk is placed on a node. This requires a minimum of _data-chunks-in-stripe + parity-chunks-in-stripe + 1_ nodes in the cluster."', dest='strict_node_anti_affinity', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='Enable inline CRC checksum validation on every IO for silent-data-error protection. 
Cannot be enabled or disabled after cluster creation.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--name', '-n', help='Assigns a name to the newly created cluster.', type=str, dest='name') argument = subcommand.add_argument('--client-data-nic', help='Network interface name from client to use for logical volume connection.', type=str, dest='client_data_nic') argument = subcommand.add_argument('--use-backup', help='The path to JSON file with S3/MinIO backup configuration.', type=str, dest='use_backup') diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 05378f51a..e49727ada 100755 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -140,7 +140,8 @@ def storage_node__configure(self, sub_command, args): args.max_lvol, max_prov, sockets_to_use,args.nodes_per_socket, pci_allowed, pci_blocked, force=args.force, device_model=args.device_model, size_range=args.size_range, cores_percentage=cores_percentage, nvme_names=nvme_names, - calculate_hp_only=args.calculate_hp_only, number_of_devices=number_of_devices) + calculate_hp_only=args.calculate_hp_only, number_of_devices=number_of_devices, + inline_checksum=args.inline_checksum) def storage_node__deploy_cleaner(self, sub_command, args): storage_ops.deploy_cleaner() @@ -1001,12 +1002,14 @@ def cluster_add(self, args): with open(args.use_backup, 'r') as f: backup_config = _json.load(f) + inline_checksum = getattr(args, 'inline_checksum', False) return cluster_ops.add_cluster( blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric, client_data_nic, max_fault_tolerance=max_fault_tolerance, backup_config=backup_config, - nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port) + nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port, + inline_checksum=inline_checksum) def cluster_create(self, args): import json as _json @@ -1043,6 +1046,7 @@ def cluster_create(self, args): is_single_node = args.is_single_node fabric = args.fabric client_data_nic = args.client_data_nic + inline_checksum = getattr(args, 'inline_checksum', False) max_fault_tolerance = min(distr_npcs, 2) if distr_npcs >= 1 else 1 @@ -1060,7 +1064,8 @@ def cluster_create(self, args): strict_node_anti_affinity, name, tls_secret, ingress_host_source, dns_name, fabric, is_single_node, client_data_nic, max_fault_tolerance=max_fault_tolerance, backup_config=backup_config, - nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port) + nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port, + inline_checksum=inline_checksum) def query_yes_no(self, question, default="yes"): """Ask a yes/no question via raw_input() and return their answer. 
diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index c40b96875..0e9dc8239 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -15,7 +15,8 @@ from docker.errors import DockerException from simplyblock_core import utils, scripts, constants, mgmt_node_ops, storage_node_ops -from simplyblock_core.controllers import backup_controller, cluster_events, device_controller, qos_controller, tasks_controller +from simplyblock_core.controllers import backup_controller, cluster_events, device_controller, qos_controller, tasks_controller, tcp_ports_events +from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.db_controller import DBController from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule @@ -225,7 +226,8 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, enable_node_affinity, qpair_count, client_qpair_count, max_queue_size, inflight_io_threshold, disable_monitoring, strict_node_anti_affinity, name, tls_secret, ingress_host_source, dns_name, fabric, is_single_node, client_data_nic, nvmeof_tls_config=None, max_fault_tolerance=1, backup_config=None, - nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, container_image_prefix=None) -> str: + nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, container_image_prefix=None, + inline_checksum=False) -> str: if distr_ndcs == 0 and distr_npcs == 0: raise ValueError("both distr_ndcs and distr_npcs cannot be 0") @@ -348,6 +350,7 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.disable_monitoring = disable_monitoring cluster.mode = mode cluster.full_page_unmap = False + cluster.inline_checksum = bool(inline_checksum) cluster.client_data_nic = client_data_nic or "" cluster.max_fault_tolerance = max_fault_tolerance cluster.nvmf_base_port = nvmf_base_port @@ -468,7 +471,8 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, cr_namespace=None, cr_plural=None, fabric="tcp", cluster_ip=None, grafana_secret=None, client_data_nic="", max_fault_tolerance=1, backup_config=None, - nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001) -> str: + nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, + inline_checksum=False) -> str: default_cluster = None @@ -565,6 +569,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] cluster.full_page_unmap = False + cluster.inline_checksum = bool(inline_checksum) cluster.client_data_nic = client_data_nic or "" cluster.max_fault_tolerance = max_fault_tolerance cluster.nvmf_base_port = nvmf_base_port @@ -607,6 +612,22 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: ols_status = Cluster.STATUS_UNREADY else: set_cluster_status(cl_id, Cluster.STATUS_IN_ACTIVATION) + + # First-time activation runs while no primary LVS is serving fabric I/O + # yet, so the recreate paths run with activation_mode=True (peer LVS / + # leader / hublvol RPCs short-circuited — peer stacks aren't fully built + # during this phase, so they would not be safe to call). Re-activation + # (e.g. 
suspended → in_activation after JCERR, or force-reactivating an
+    # active/degraded cluster) is different: every primary's SPDK and lvstore
+    # are still alive and serving I/O — the secondary's examine of its non-
+    # leader raid0 races the live leader's blob-metadata writes and fails
+    # with bs_load_cur_extent_page_valid CRC mismatch on every retry
+    # (observed 2026-05-11, LVS_6769 on node 8084 — 22+ minute examine loop).
+    # We keep activation_mode=True (so peer LVS/hublvol RPCs stay disabled)
+    # and add only a firewall-only port-block on the live leader around the
+    # non-leader recreate in Pass 2. Port-block is benign on peers whose
+    # service isn't listening, so it's safe even against not-fully-built peers.
+    is_fresh_activation = (ols_status == Cluster.STATUS_UNREADY)
     snodes = db_controller.get_storage_nodes_by_cluster_id(cl_id)
     online_nodes = []
     dev_count = 0
@@ -744,16 +765,61 @@
         for primary_node in primary_nodes:
             primary_node.lvstore_status = "in_creation"
             primary_node.write_to_db()
+
+            # On re-activation the primary's LVS is still alive and serving
+            # client I/O — snode's examine of its non-leader raid0 will race
+            # the leader's blob-metadata writes unless we quiesce the leader
+            # first. We do this with a firewall-only port-block on the leader:
+            # it has no effect on a peer whose service isn't listening (per
+            # design, safe even when peer stacks aren't fully built yet) but
+            # it stops the live leader from issuing writes that race the
+            # examine. We deliberately do NOT switch the helper out of
+            # activation_mode here: that would enable peer leader/distrib/
+            # lvstore/hublvol RPCs which presume the peer's full stack is up.
+            leader_blocked = False
+            leader_port = None
+            leader_ptype = "tcp"
+            if not is_fresh_activation and primary_node.status == StorageNode.STATUS_ONLINE:
+                try:
+                    leader_port = primary_node.get_lvol_subsys_port(primary_node.lvstore)
+                    leader_ptype = "udp" if primary_node.active_rdma else "tcp"
+                    FirewallClient(primary_node, timeout=3, retry=1).firewall_set_port(
+                        leader_port, leader_ptype, "block", primary_node.rpc_port)
+                    tcp_ports_events.port_deny(primary_node, leader_port)
+                    leader_blocked = True
+                    time.sleep(0.5)
+                except Exception as e:
+                    logger.warning(
+                        "Re-activation: port-block on leader %s for %s failed: %s — "
+                        "proceeding without block (secondary examine may race live leader writes)",
+                        primary_node.get_id(), primary_node.lvstore, e)
+
             try:
-                r = storage_node_ops.recreate_lvstore_on_non_leader(
-                    snode, primary_node, primary_node, activation_mode=True)
-            except storage_node_ops.LVSRestartRequiredError as e:
-                logger.error(e)
-                set_cluster_status(cl_id, ols_status)
-                raise ValueError(
-                    f"Failed to activate cluster: node {e.node_id} holds "
-                    f"partial state for LVS {e.lvs_name} (non-leader). "
-                    f"Restart node {e.node_id} before activating.")
+                try:
+                    r = storage_node_ops.recreate_lvstore_on_non_leader(
+                        snode, primary_node, primary_node, activation_mode=True)
+                except storage_node_ops.LVSRestartRequiredError as e:
+                    logger.error(e)
+                    set_cluster_status(cl_id, ols_status)
+                    raise ValueError(
+                        f"Failed to activate cluster: node {e.node_id} holds "
+                        f"partial state for LVS {e.lvs_name} (non-leader). "
+                        f"Restart node {e.node_id} before activating.")
+            finally:
+                if leader_blocked:
+                    try:
+                        FirewallClient(primary_node, timeout=3, retry=1).firewall_set_port(
+                            leader_port, leader_ptype, "allow", primary_node.rpc_port)
+                        tcp_ports_events.port_allowed(primary_node, leader_port)
+                    except Exception as ue:
+                        logger.error(
+                            "Failed to unblock leader %s:%s after non-leader recreate: %s — scheduling port_allow",
+                            primary_node.get_id(), leader_port, ue)
+                        try:
+                            tasks_controller.add_port_allow_task(
+                                primary_node.cluster_id, primary_node.get_id(), leader_port)
+                        except Exception as se:
+                            logger.error("Failed to schedule port_allow fallback: %s", se)
 
             if not r:
                 ret = False
diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py
index b264c689c..b6fc94fd2 100644
--- a/simplyblock_core/controllers/device_controller.py
+++ b/simplyblock_core/controllers/device_controller.py
@@ -297,12 +297,21 @@ def _def_create_device_stack(device_obj, snode, force=False, clear_data=False):
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
 
     if alceml_name not in bdev_names:
+        checksum_method, cache_size, cache_eviction_threshold = utils.alceml_checksum_params(cluster, device_obj)
+        if cluster.inline_checksum and not device_obj.md_supported:
+            logger.warning(
+                f"Inline checksum: device {device_obj.get_id()} ({device_obj.pcie_address}) has no NVMe metadata; "
+                f"alceml will run in fallback mode (extra md page, ~1.17% capacity overhead)."
+            )
         ret = snode.create_alceml(
             alceml_name, nvme_bdev, alceml_id,
             pba_init_mode=3 if clear_data else 2,
             write_protection=cluster.distr_ndcs > 1,
             pba_page_size=cluster.page_size_in_blocks,
-            full_page_unmap=cluster.full_page_unmap
+            full_page_unmap=cluster.full_page_unmap,
+            checksum_method=checksum_method,
+            cache_size=cache_size,
+            cache_eviction_threshold=cache_eviction_threshold,
         )
 
         if not ret:
diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py
index b01ded610..041f397aa 100644
--- a/simplyblock_core/controllers/lvol_controller.py
+++ b/simplyblock_core/controllers/lvol_controller.py
@@ -126,11 +126,27 @@ def _create_crypto_lvol(rpc_client, name, base_name, key1, key2):
     if not ret:
         logger.error(f"Failed to find LVol bdev {base_name}")
         return False
+
+    # Idempotent: if the crypto bdev already exists from a prior partial
+    # activation/restart pass, skip the key + crypto-bdev creates. SPDK
+    # rejects duplicate creates with hard errors that would otherwise
+    # break re-activation convergence.
+    if rpc_client.get_bdevs(name):
+        logger.info("crypto LVol %s already exists, skipping create", name)
+        return True
+
     key_name = f'key_{name}'
     ret = rpc_client.lvol_crypto_key_create(key_name, key1, key2)
     if not ret:
-        logger.error("failed to create crypto key")
-        return False
+        # SPDK returns failure when the key name already exists. On
+        # re-activation that's the same node re-issuing the same key —
+        # treat existing key as benign and proceed to the crypto-bdev
+        # create below. If creation genuinely failed for another reason,
+        # the next call will surface it.
+ logger.warning( + "lvol_crypto_key_create returned failure for %s; if the key " + "already exists from a prior pass this is expected — " + "proceeding to crypto bdev create", key_name) ret = rpc_client.lvol_crypto_create(name, base_name, key_name) if not ret: logger.error(f"failed to create crypto LVol {name}") @@ -432,6 +448,11 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= if dev.status == dev.STATUS_ONLINE: dev_count += 1 cluster_size_total += dev.size + # Inline-checksum fallback layout reserves 6 of every 510 data blocks + # per 2 MiB extent for the extended md page + filler. Charge that as + # initial utilization rather than reducing reported raw capacity. + if cl.inline_checksum and not dev.md_supported: + cluster_size_prov += utils.alceml_fallback_overhead_bytes(cl, dev.size) if len(online_nodes) == 0: logger.error("No online Storage nodes found") @@ -1631,9 +1652,21 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, for h in lvol.allowed_hosts: if h["nqn"] == host_nqn: host_entry = h - pool = db_controller.get_pool_by_id(lvol.pool_uuid) - host_entry["dhchap_key"] = pool.dhchap_key - host_entry["dhchap_ctrlr_key"] = pool.dhchap_ctrlr_key + # Note: an earlier change (sfam-2722) unconditionally + # injected ``pool.dhchap_key`` / ``pool.dhchap_ctrlr_key`` + # into ``host_entry`` here. That broke three contracts: + # - it overrode existing host-level keys + # - it injected pool keys when the pool had no DHCHAP + # configured (pool.dhchap_key=None) + # - it emitted ``--dhchap-secret`` for hosts using PSK + # (TLS auth), which conflicts with the host's chosen + # auth mode + # Pool-level DHCHAP keys are retrieved by clients via a + # separate path (see ``_register_pool_dhchap_keys_on_node``); + # the connect command must not embed them. If sfam-2722 + # needs pool keys propagated for a specific case, add a + # guarded code path that gates on ``pool.dhchap`` and + # only sets keys the host_entry doesn't already have. break if not host_entry: return False, f"Host NQN {host_nqn} not found in allowed hosts for volume {uuid}" diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 5abdecd44..f90552c61 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,5 +1,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.34 +SIMPLY_BLOCK_VERSION=19.2.36 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inline-checksum-validation +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:checksum-validation-latest diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 1f0588a1b..29b79c7db 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -73,6 +73,9 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + # Inline CRC checksum validation for silent-data-error protection. + # Frozen at cluster create time; no upgrade path for existing clusters. 
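+    # Per-device behavior when True is decided at alceml create time from
+    # NVMeDevice.md_supported via utils.alceml_checksum_params: md-capable
+    # devices run md-on-device (method 1), the rest fall back to the extra
+    # md-page layout (method 2).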
+ inline_checksum: bool = False snapshot_replication_target_cluster: str = "" snapshot_replication_target_pool: str = "" snapshot_replication_timeout: int = 60*10 diff --git a/simplyblock_core/models/nvme_device.py b/simplyblock_core/models/nvme_device.py index 1badd5942..b1e7299ee 100644 --- a/simplyblock_core/models/nvme_device.py +++ b/simplyblock_core/models/nvme_device.py @@ -66,6 +66,11 @@ class NVMeDevice(BaseModel): last_flap_tsc: float = 0.0 serial_number: str = "" size: int = -1 + # NVMe per-block metadata size in bytes, as reported by the bound SPDK bdev. + # >=8 means alceml can run in cv_md_method (no read/write amplification). + # 0 means alceml must use cv_fallback_method (extra md page per 2 MiB extent). + md_size: int = 0 + md_supported: bool = False testing_bdev: str = "" connecting_from_node: str = "" previous_status: str = "" diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 3b1243dff..ad289a8db 100755 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -570,7 +570,8 @@ def qos_vbdev_delete(self, name): def bdev_alceml_create(self, alceml_name, nvme_name, uuid, pba_init_mode=3, alceml_cpu_mask="", alceml_worker_cpu_mask="", pba_page_size=2097152, - write_protection=False, full_page_unmap=False): + write_protection=False, full_page_unmap=False, + checksum_method=0, cache_size=0, cache_eviction_threshold=0): params = { "name": alceml_name, "cntr_path": nvme_name, @@ -594,6 +595,15 @@ def bdev_alceml_create(self, alceml_name, nvme_name, uuid, pba_init_mode=3, params["write_protection"] = True if full_page_unmap: params["use_map_whole_page_on_1st_write"] = True + # Inline CRC checksum validation. method: 0=off, 1=md-on-device, 2=fallback (extra md page). + # The data plane reads md_size from spdk_bdev_get_md_size and refuses method=1 when md_size==0, + # so the caller must pick method=2 for devices without NVMe metadata support. + if checksum_method: + params["checksum_validation_method"] = int(checksum_method) + if cache_size: + params["cache_size"] = int(cache_size) + if cache_eviction_threshold: + params["cache_eviction_threshold"] = int(cache_eviction_threshold) return self._request("bdev_alceml_create", params) def bdev_distrib_create(self, name, vuid, ndcs, npcs, num_blocks, block_size, jm_names, diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 3e9a72c8d..1f5ea5e1a 100755 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -122,6 +122,61 @@ def _rpc_lvstore_exists(rpc_client, lvs_name): return False +def _kill_spdk_until_dead(snode, max_attempts=3, poll_per_attempt_sec=5, + poll_interval=0.25): + """Kill SPDK on `snode` and return only after it is verifiably gone. + + Per design: any abort during restart MUST kill SPDK so the next attempt + starts from a clean process — leftover bdevs (raid0_, lvol + subsystems) cause "Duplicate bdev name" / "Subsystem already exists" + failures on retry that loop the auto-restart forever. + + The previous behavior (single 5 s soft window, log warning, proceed) + silently left zombies behind. We now retry the kill until SPDK is + confirmed down. Bounded total wall-clock = max_attempts * + poll_per_attempt_sec so a wedged docker daemon cannot trap the caller. + Returns True if SPDK died, False if all attempts exhausted (caller is + responsible for whatever comes next; the node should still be marked + OFFLINE so it stops being treated as in_restart). 
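+
+    With the defaults this bounds the post-kill polling at max_attempts *
+    poll_per_attempt_sec = 3 * 5 = 15 s (plus whatever the kill RPC itself
+    takes under its own client timeout).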
+ """ + snode_api = snode.client(timeout=5, retry=5) + for attempt in range(1, max_attempts + 1): + try: + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) + except Exception as e: + logger.warning( + "spdk_process_kill RPC failed on %s (attempt %d/%d): %s", + snode.get_id(), attempt, max_attempts, e, + ) + + deadline = time.time() + poll_per_attempt_sec + while time.time() < deadline: + try: + up = snode_api.spdk_process_is_up(snode.rpc_port, snode.cluster_id) + except Exception: + up = False + if not up: + logger.info( + "SPDK on %s confirmed down (kill attempt %d/%d)", + snode.get_id(), attempt, max_attempts, + ) + return True + time.sleep(poll_interval) + + logger.warning( + "SPDK on %s still up after %ds (attempt %d/%d); re-issuing kill", + snode.get_id(), poll_per_attempt_sec, attempt, max_attempts, + ) + + logger.error( + "SPDK on %s did NOT die after %d kill attempts (%ds total) — " + "investigate snode_api / docker daemon health on %s", + snode.get_id(), max_attempts, + max_attempts * poll_per_attempt_sec, snode.mgmt_ip, + ) + return False + + def _reapply_allowed_hosts(lvol, snode, rpc_client): """Re-register allowed hosts (with DHCHAP keys) on a subsystem after recreation.""" from simplyblock_core.controllers.lvol_controller import _register_dhchap_keys_on_node, _get_dhchap_group @@ -471,22 +526,43 @@ def _search_for_partitions(rpc_client, nvme_device): def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): + # When the two NVMes on a node format to different LBAFs (e.g. + # heterogeneous hardware where one drive supports an md-capable LBAF + # and the other doesn't), the resulting partition bdevs end up with + # different md_size and SPDK's RAID1 layer rejects the mirror with + # EINVAL ("different metadata format than base bdev"). For lab + # configurations where we deliberately want to exercise both + # md-on-device and fallback alceml modes in the same cluster, fall + # back to a single-bdev JM (no mirror) on the first partition so + # the deploy can proceed. We persist only that first bdev in + # jm_nvme_bdev_list so the restart path takes the existing single- + # bdev recreate branch and doesn't try to rebuild the RAID. 
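+    # Tradeoff (accepted for lab use only): a single-bdev JM gives up the
+    # RAID1 mirror's redundancy for the journal — fine here because the
+    # heterogeneous layout exists to exercise both checksum modes, not to
+    # test JM durability.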
    if snode.jm_device and snode.jm_device.raid_bdev:
         raid_bdev = snode.jm_device.raid_bdev
         if raid_bdev.startswith("raid_jm_"):
             raid_level = "1"
             ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level)
             if not ret:
-                logger.error(f"Failed to create raid_jm_{snode.get_id()}")
-                return False
+                logger.warning(
+                    f"RAID create failed for {raid_bdev} on bdevs {jm_nvme_bdevs} "
+                    f"(likely heterogeneous metadata format); falling back to "
+                    f"single-bdev JM on {jm_nvme_bdevs[0]}"
+                )
+                raid_bdev = jm_nvme_bdevs[0]
+                jm_nvme_bdevs = [jm_nvme_bdevs[0]]
     else:
         if len(jm_nvme_bdevs) > 1:
             raid_bdev = f"raid_jm_{snode.get_id()}"
             raid_level = "1"
             ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level)
             if not ret:
-                logger.error(f"Failed to create raid_jm_{snode.get_id()}")
-                return False
+                logger.warning(
+                    f"RAID create failed for {raid_bdev} on bdevs {jm_nvme_bdevs} "
+                    f"(likely heterogeneous metadata format); falling back to "
+                    f"single-bdev JM on {jm_nvme_bdevs[0]}"
+                )
+                raid_bdev = jm_nvme_bdevs[0]
+                jm_nvme_bdevs = [jm_nvme_bdevs[0]]
         else:
             raid_bdev = jm_nvme_bdevs[0]
@@ -667,12 +743,22 @@
 
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
 
+    checksum_method, cache_size, cache_eviction_threshold = utils.alceml_checksum_params(cluster, nvme)
+    if cluster.inline_checksum and not nvme.md_supported:
+        logger.warning(
+            f"Inline checksum: device {nvme.get_id()} ({nvme.pcie_address}) has no NVMe metadata; "
+            f"alceml will run in fallback mode (extra md page, ~1.17% capacity overhead)."
+        )
+
     ret = snode.create_alceml(
         alceml_name, nvme_bdev, alceml_id,
         pba_init_mode=1 if (after_restart and nvme.status != NVMeDevice.STATUS_NEW) else 3,
         write_protection=cluster.distr_ndcs > 1,
         pba_page_size=cluster.page_size_in_blocks,
         full_page_unmap=cluster.full_page_unmap,
+        checksum_method=checksum_method,
+        cache_size=cache_size,
+        cache_eviction_threshold=cache_eviction_threshold,
     )
 
     if not ret:
@@ -2180,6 +2266,36 @@
             f"Restart of {node_id} failed (post-status={post_node.status}); "
             f"resetting to OFFLINE to unblock future attempts"
         )
+
+        # Abort contract: SPDK MUST be killed on every failed
+        # restart that owned the lock, so the next attempt starts
+        # from a clean process. Without this, _restart_storage_node_impl
+        # has ~20 different `return False` paths (per-device setup,
+        # examine, subsystem create, listener add, remote-dev
+        # connect, etc.) that all leave SPDK running with whatever
+        # bdevs the impl already set up — causing the next attempt
+        # to fail on "Duplicate bdev name for manual examine:
+        # raid0_" / "Subsystem NQN ... already exists" and
+        # loop forever (incident 2026-05-10, b278fd62 restart
+        # attempts 1–3). Routing every owned-lock failure through
+        # _kill_spdk_until_dead closes those gaps in one place.
+        # Idempotent: a fast no-op when SPDK was never started in
+        # this attempt. Inner abort paths (recreate_lvstore's
+        # _abort_restart_and_unblock, restart_storage_node's
+        # _abort_restart) emit the snode_restart_failed event
+        # already; the wrapper does NOT re-emit it to avoid
+        # duplicate events and to avoid the FDB write that
+        # `snode_restart_failed` performs unconditionally (which
+        # would raise SystemExit through base_model.write_to_db
+        # on hosts without FDB — the wrapper must not depend on
+        # FDB liveness for cleanup correctness).
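+        # (Concretely: the wrapper polls spdk_process_is_up via
+        # _kill_spdk_until_dead until it reports False — or attempts run
+        # out — before forcing the OFFLINE write below.)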
+ try: + _kill_spdk_until_dead(post_node) + except Exception as kill_exc: + logger.error( + f"Restart cleanup: kill SPDK on {node_id} raised: {kill_exc}" + ) + # Force the OFFLINE write — bypass the state-machine guard # in set_node_status (which only restricts ONLINE writes # anyway, but we use a direct write here to avoid any @@ -2590,6 +2706,9 @@ def _restart_storage_node_impl( db_dev.nvme_bdev = found_dev.nvme_bdev db_dev.nvme_controller = found_dev.nvme_controller db_dev.pcie_address = found_dev.pcie_address + # Refresh md detection so a re-format between restarts is reflected + db_dev.md_size = found_dev.md_size + db_dev.md_supported = found_dev.md_supported # if db_dev.status in [ NVMeDevice.STATUS_ONLINE]: # db_dev.status = NVMeDevice.STATUS_UNAVAILABLE @@ -2754,11 +2873,19 @@ def _restart_storage_node_impl( # Before each, perform disconnect checks on the other two nodes. def _abort_restart(reason): - """Kill SPDK and set offline on fatal error.""" + """Kill SPDK and set offline on fatal error. + + Contract: any abort during restart kills SPDK reliably (verified + down) before returning, so the next restart attempt starts from + a clean SPDK process. The previous implementation issued a + single fire-and-forget ``spdk_process_kill`` and proceeded — + which left zombie SPDK behind when docker-rm took >5 s, + causing the next attempt to fail with "Duplicate bdev name for + manual examine: raid0_" and loop forever. + """ logger.error(f"Restart abort: {reason}") storage_events.snode_restart_failed(snode) - snode_api_inner = snode.client(timeout=5, retry=5) - snode_api_inner.spdk_process_kill(snode.rpc_port, snode.cluster_id) + _kill_spdk_until_dead(snode) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) try: @@ -2768,10 +2895,16 @@ def _abort_restart(reason): _abort_restart(f"LVS recreation failed: {e}") return False if not ret: + # Restart abort path. recreate_all_lvstores returning False is + # ALSO a restart abort and must honor the same kill+offline + # contract — otherwise SPDK keeps running with the partial + # bdev stack from this attempt (e.g. raid0_ created via + # auto-examine) and the next retry fails on "Duplicate bdev + # name". 10:58:11 in the AWS soak run hit exactly this gap. snode = db_controller.get_storage_node_by_id(snode.get_id()) snode.lvstore_status = "failed" snode.write_to_db() - set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) + _abort_restart("recreate_all_lvstores returned False") return False # === Phase 10: Finalization — post all LVS recreation === @@ -3345,7 +3478,6 @@ def suspend_storage_node(node_id, force=False, change_node_status=True): logger.info("Suspending node") - rpc_client = snode.rpc_client() fw_api = FirewallClient(snode, timeout=20, retry=1) port_type = "tcp" if snode.active_rdma: @@ -3367,14 +3499,27 @@ def _revert_blocked_ports(): try: # Block per-lvstore ports for secondary lvstores hosted on this node. - # Order: client (lvs) port FIRST, then 1 s for peers to drain in-flight - # IO via the still-open hublvol back to the acting leader, then the - # hublvol port. Reversing this order — blocking the hublvol port while - # the client port is still open — kills the redirect path while clients - # may still be funneling IO into the secondary, forcing the secondary's - # lvstore layer to auto-promote (lvol.c:3508 "Leadership changed due - # to receive new IO"), which races the in-flight leader and produces - # a writer conflict (incident 2026-05-06 iter 2, jm_vuid=4450). + # Per-LVS order: + # 1. 
block client (lvs) port — multipath clients fail over to peers + # 2. sleep so any pre-block in-flight IO already on our distrib + # pipeline can complete locally before we close the hublvol + # 3. block the hublvol port — surviving peer detects "no redirect + # target", JC consensus + auto-promotion on the peer elects the + # new leader cleanly (the in-flight IO that triggered earlier + # revisions of this code is already drained by step 2). + # We deliberately do NOT issue an explicit + # bdev_lvol_set_leader(leader=False) / bdev_distrib_force_to_non_leader + # at any point inside this loop. Prior incidents: + # 2026-05-06 iter 2, jm_vuid=4450 — hub closed before client; fixed + # by client-first ordering. + # 2026-05-10 iter 3, jm_vuid=4245 — multipath refilled the + # secondary's pipeline during the drain; tried "demote between + # the two blocks" (3ae2a9b0). Still produced a writer conflict + # because the explicit demote races pre-block IO still being + # processed on the local distrib (port block does not halt + # internal distrib processing of already-queued requests); the + # IO completes as non-leader → conflict. Dropped the demote + # entirely and rely on the peer's auto-promotion path. if snode.lvstore_stack_secondary or snode.lvstore_stack_tertiary: nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: @@ -3386,11 +3531,9 @@ def _revert_blocked_ports(): if sec_hub_port: _block_port(sec_hub_port) time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) # Block per-lvstore ports for this node's own primary lvstore - # (same client-port-first ordering — see comment above). + # (same per-LVS ordering — see comment above). own_lvs_port = snode.get_lvol_subsys_port(snode.lvstore) own_hub_port = snode.get_hublvol_port(snode.lvstore) _block_port(own_lvs_port) @@ -3398,8 +3541,6 @@ def _revert_blocked_ports(): if own_hub_port: _block_port(own_hub_port) time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) time.sleep(1) except Exception as e: logger.error(f"Failed during suspend port blocking/leadership transfer: {e}") @@ -3689,7 +3830,7 @@ def upgrade_automated_deployment_config(): def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None, k8s=False, - calculate_hp_only=False, number_of_devices=0): + calculate_hp_only=False, number_of_devices=0, inline_checksum=False): if calculate_hp_only: minimum_hp_memory = utils.calculate_hp_only(max_lvol, number_of_devices, sockets_to_use, nodes_per_socket, cores_percentage) hp_number = math.ceil(minimum_hp_memory / 2) @@ -3707,7 +3848,8 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod nodes_config, system_info = utils.generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage, force=force, - device_model=device_model, size_range=size_range, nvme_names=nvme_names) + device_model=device_model, size_range=size_range, nvme_names=nvme_names, + inline_checksum=inline_checksum) if not nodes_config or not nodes_config.get("nodes"): return False utils.store_config_file(nodes_config, constants.NODES_CONFIG_FILE, create_read_only_file=True) @@ -4806,6 +4948,33 @@ def _abort_and_unblock(reason): "Raid %s and lvstore %s already 
present on %s; skipping examine", primary_node.raid, primary_node.lvstore, snode.get_id()) else: + if raid_already and not lvstore_already: + # Same convergence trap as in recreate_lvstore: the raid was + # examined on a prior pass and the lvstore module did not + # surface it. SPDK rejects re-examine of an already-examined + # bdev with "Duplicate bdev name for manual examine", so a + # plain bdev_examine here is a silent no-op that loops the + # activation retry forever. Drop the raid and re-create via + # _create_bdev_stack (idempotent) so the next examine is + # against a freshly-registered raid. + logger.info( + "Raid %s present but lvstore %s did not surface on %s; " + "dropping raid for clean re-examine", + primary_node.raid, primary_node.lvstore, snode.get_id()) + try: + snode_rpc_client.bdev_raid_delete(primary_node.raid) + except Exception as e: + logger.warning( + "bdev_raid_delete(%s) raised: %s — proceeding to " + "_create_bdev_stack which is idempotent", + primary_node.raid, e) + ret, err = _create_bdev_stack(snode, primary_node.lvstore_stack, + primary_node=primary_node) + if not ret: + logger.error( + "Failed to rebuild bdev stack on %s after raid drop: %s", + snode.get_id(), err) + # Examine is required whenever the lvstore isn't surfaced — whether # the raid was freshly created by _create_bdev_stack (normal restart # path) or pre-existing with stale state (activation retry). @@ -5312,19 +5481,28 @@ def recreate_lvstore(snode, force=False, lvs_primary=None, activation_mode=False lvol_ana_state = "optimized" - ### 2- create lvols nvmf subsystems + ### 2- create lvols nvmf subsystems (idempotent: probe SPDK first; mirrors + ### the pattern in recreate_lvstore_on_non_leader so a re-activation that + ### finds the subsystem already present from a prior partial pass does not + ### emit "Subsystem NQN ... already exists" / "Unable to create subsystem". created_subsystems = [] for lvol in lvol_list: - if lvol.nqn not in created_subsystems: - allow_any = not bool(lvol.allowed_hosts) + if lvol.nqn in created_subsystems: + continue + allow_any = not bool(lvol.allowed_hosts) + if _rpc_subsystem_exists(rpc_client, lvol.nqn): + logger.info("subsystem %s already exists on %s, skipping create", + lvol.nqn, snode.get_id()) + created_subsystems.append(lvol.nqn) + else: logger.info("creating subsystem %s (allow_any_host=%s)", lvol.nqn, allow_any) ret = rpc_client.subsystem_create(lvol.nqn, lvol.ha_type, lvol.uuid, 1, max_namespaces=constants.LVO_MAX_NAMESPACES_PER_SUBSYS, allow_any_host=allow_any) if ret: created_subsystems.append(lvol.nqn) - if lvol.allowed_hosts: - _reapply_allowed_hosts(lvol, snode, rpc_client) + if lvol.allowed_hosts: + _reapply_allowed_hosts(lvol, snode, rpc_client) # ANA failback only when the original primary is coming back (not takeover) if not is_takeover and lvs_node.secondary_node_id and lvol_list: @@ -5361,31 +5539,21 @@ def _unblock_peer_port(peer): pass def _kill_app(): + """Kill SPDK on snode and mark OFFLINE before peer ports unblock. + + Holding the peer port blocks during this wait is intentional: + unblocking before SPDK is confirmed dead lets a residual primary + on snode race the acting-leader and produce a writer conflict. + + Implemented via the module-level :func:`_kill_spdk_until_dead` + helper so the same hardened kill logic is used by every abort + path (recreate_lvstore aborts here; restart_storage_node aborts + in `_abort_restart`). 
On total kill failure we still mark the + node OFFLINE so it stops being treated as in_restart by the + cluster, and so peer ports get released by the caller. + """ storage_events.snode_restart_failed(snode) - snode_api = snode.client(timeout=5, retry=5) - snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) - # spdk_process_kill returns as soon as the HTTP request is - # queued — SPDK may keep serving IO for a short while after. - # Block here until SPDK is actually gone so the subsequent - # peer-port unblock in _abort_restart_and_unblock cannot race - # client IO back into the secondary while a still-alive primary - # on snode tries to serve as leader (→ writer conflict). - # We hold the peer port blocks during this wait; cap it at 5 s - # so a stuck kill does not leave peers permanently blocked. - deadline = time.time() + 5 - while time.time() < deadline: - try: - up = snode_api.spdk_process_is_up(snode.rpc_port, snode.cluster_id) - except Exception: - up = False - if not up: - break - time.sleep(0.25) - else: - logger.warning( - "SPDK on %s still up 5s after kill signal; proceeding with unblock anyway", - snode.get_id(), - ) + _kill_spdk_until_dead(snode) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) def _abort_restart_and_unblock(reason): @@ -5556,6 +5724,42 @@ def _abort_restart_and_unblock(reason): "Raid %s and lvstore %s already present on %s; skipping examine", lvs_raid, lvs_name, snode.get_id()) else: + if raid_already and not lvstore_already: + # Raid is present but the lvstore module never surfaced it on + # this SPDK process (e.g. a prior activation pass examined the + # raid and the lvstore-side examine failed/was incomplete). + # SPDK rejects re-examine of an already-examined bdev with + # "Duplicate bdev name for manual examine: ", so calling + # bdev_examine again is a no-op that leaves the lvstore + # missing forever and burns the activation retry loop. + # + # Drop the raid so the underlying distribs are reusable, then + # re-create it via _create_bdev_stack (which is itself + # idempotent — it skips bdevs already present and only creates + # what's missing). The fresh bdev_examine below now runs + # against a newly-registered raid and the lvstore module gets + # a real chance to surface. + logger.info( + "Raid %s present but lvstore %s did not surface on %s; " + "dropping raid for clean re-examine", + lvs_raid, lvs_name, snode.get_id()) + try: + rpc_client.bdev_raid_delete(lvs_raid) + except Exception as e: + logger.warning( + "bdev_raid_delete(%s) raised: %s — proceeding to " + "_create_bdev_stack which is idempotent", lvs_raid, e) + stack = lvs_node.lvstore_stack if is_takeover else None + if is_takeover: + ret, err = _create_bdev_stack(snode, stack, primary_node=lvs_node) + else: + ret, err = _create_bdev_stack(snode, []) + if not ret: + logger.error( + "Failed to rebuild bdev stack on %s after raid drop: %s", + snode.get_id(), err) + # Fall through; bdev_examine below will surface what we have. + # Examine is required whenever the lvstore isn't surfaced — whether # the raid was freshly created by _create_bdev_stack (normal restart # path) or pre-existing with stale state (activation retry). 
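Reviewer note: the raid-drop recovery added in both restart paths above follows the same shape. A minimal sketch of that pattern (the names `rpc`, `rebuild_stack`, and `log` are stand-ins for rpc_client, the `_create_bdev_stack` call, and the module logger; the real code threads snode/primary_node/lvstore_stack specifics through):

    def reexamine_after_raid_drop(rpc, raid_name, lvstore_name, rebuild_stack, log):
        # SPDK rejects manual re-examine of an already-examined bdev
        # ("Duplicate bdev name for manual examine"), so examining a stale
        # raid is a silent no-op. Drop it first.
        try:
            rpc.bdev_raid_delete(raid_name)
        except Exception as e:
            # rebuild_stack (_create_bdev_stack) is idempotent, so a failed
            # delete only warrants a warning before rebuilding anyway.
            log.warning("bdev_raid_delete(%s) raised: %s", raid_name, e)
        ok, err = rebuild_stack()
        if not ok:
            log.error("failed to rebuild stack for %s: %s", lvstore_name, err)
        # Fall through: the caller's existing bdev_examine now targets a
        # freshly registered raid, so the lvstore module gets a real chance
        # to surface.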
The diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 69a43aa12..f43dfa01b 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1209,6 +1209,41 @@ def validate_sec_options(sec_options): return True, None +def alceml_fallback_overhead_bytes(cluster, device_size_bytes): + """Bytes of capacity charged as initial utilization when alceml runs in + cv_fallback_method on a device. Per 2 MiB extent the layout shrinks from + 510 to 504 data blocks (1 extended-md block + 5 filler), so we lose 6 + blocks per page. Returns 0 when inline_checksum is off, when device size + is unknown, or when the device runs in md-on-device mode (caller must + have already filtered for md_supported=False). + """ + if not getattr(cluster, 'inline_checksum', False): + return 0 + if not device_size_bytes or device_size_bytes <= 0: + return 0 + blk_size = cluster.blk_size or 4096 + page_size = cluster.page_size_in_blocks or (2 * 1024 * 1024) + pages = device_size_bytes // page_size + return int(pages * 6 * blk_size) + + +def alceml_checksum_params(cluster, nvme_device): + """Pick the inline-checksum method and tunables for bdev_alceml_create. + + Returns (method, cache_size, cache_eviction_threshold). method: + 0 = off (cluster.inline_checksum False) + 1 = md-on-device (cv_md_method, no read/write amplification) + 2 = fallback (cv_fallback_method, extra md page per 2 MiB extent) + cache_size and cache_eviction_threshold default to 0 so the data plane + keeps its built-in defaults (2000 entries, 90% eviction trigger). + """ + if not getattr(cluster, 'inline_checksum', False): + return 0, 0, 0 + if getattr(nvme_device, 'md_supported', False): + return 1, 0, 0 + return 2, 0, 0 + + def addNvmeDevices(rpc_client, snode, devs): devices = [] ret = rpc_client.bdev_nvme_controller_list() @@ -1271,6 +1306,12 @@ def addNvmeDevices(rpc_client, snode, devs): else: logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}") + # SPDK exposes per-namespace metadata size as a top-level uint32 in bdev_get_bdevs JSON + # (lib/bdev/bdev_rpc.c writes "md_size" via spdk_bdev_get_md_size). >=8 means alceml can run + # in cv_md_method on this device; 0 means it must run in cv_fallback_method. 
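# (Annotation, not part of the patch.) Shape of the bdev_get_bdevs entry the
# next lines consume; values are illustrative, only the top-level "md_size"
# field is read:
#   {"name": "nvme0n1", "block_size": 4096, "num_blocks": ...,
#    "md_size": 8, "driver_specific": {"nvme": [...]}}
# md_size >= 8 -> md_supported=True  (alceml can run in cv_md_method)
# md_size == 0 -> md_supported=False (alceml must use cv_fallback_method)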
+ md_size = int(nvme_dict.get('md_size', 0) or 0) + md_supported = md_size >= 8 + devices.append( NVMeDevice({ 'uuid': str(uuid.uuid4()), @@ -1284,7 +1325,9 @@ def addNvmeDevices(rpc_client, snode, devs): 'nvme_controller': nvme_controller, 'node_id': snode.get_id(), 'cluster_id': snode.cluster_id, - 'status': NVMeDevice.STATUS_ONLINE + 'status': NVMeDevice.STATUS_ONLINE, + 'md_size': md_size, + 'md_supported': md_supported, })) return devices @@ -1772,7 +1815,8 @@ def regenerate_config(new_config, old_config, force=False): def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None): + cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None, + inline_checksum=False): system_info = {} nodes_config: dict = {"nodes": []} @@ -1797,8 +1841,24 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a nvme_device_path = f"/dev/{nvme_device}n1" clean_partitions(nvme_device_path) nvme_json_string = get_idns(nvme_device_path) - lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) - format_nvme_device(nvme_device_path, lbaf_id) + lbaf_id = None + md_lbaf = False + if inline_checksum: + # Prefer an LBAF with metadata so alceml can run in cv_md_method on this drive. + lbaf_id = find_md_lbaf_id(nvme_json_string, target_ds=12, min_ms=8) + if lbaf_id is None: + logger.warning( + f"--enable-inline-checksum: device {nvme_device_path} exposes no 4K LBAF with >=8B metadata; " + f"formatting plain 4K. alceml will run in fallback mode on this drive." + ) + else: + md_lbaf = True + logger.info(f"Formatting {nvme_device_path} with md-capable LBAF index {lbaf_id}") + if lbaf_id is None: + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + # When switching to an md-capable LBAF, the namespace SectorSize stays 4096, + # so the in-list 4K early-out would skip the reformat. Force it. + format_nvme_device(nvme_device_path, lbaf_id, force_reformat=md_lbaf) for nid in sockets_to_use: if nid in cores_by_numa: @@ -2824,6 +2884,26 @@ def find_lbaf_id(json_data: str, target_ms: int, target_ds: int) -> int: return 0 +def find_md_lbaf_id(json_data: str, target_ds: int = 12, min_ms: int = 8): + """Return the LBAF index for a format with data-size==target_ds (log2, 12=4K) + and metadata-size>=min_ms. Among matches, prefer the smallest ms to avoid + wasting space on 64B-md formats. Returns None if no such LBAF exists. + """ + try: + data = json.loads(json_data) + except (json.JSONDecodeError, TypeError): + return None + candidates = [] + for index, lbaf in enumerate(data.get('lbafs', [])): + ms = lbaf.get('ms', 0) + if lbaf.get('ds') == target_ds and ms >= min_ms: + candidates.append((ms, index)) + if not candidates: + return None + candidates.sort() + return candidates[0][1] + + def get_idns(nvme_device: str): command = ['nvme', 'id-ns', nvme_device, '--output-format', 'json'] try: @@ -2895,8 +2975,11 @@ def is_namespace_4k_from_nvme_list(device_path: str) -> bool: return False -def format_nvme_device(nvme_device: str, lbaf_id: int): - if is_namespace_4k_from_nvme_list(nvme_device): +def format_nvme_device(nvme_device: str, lbaf_id: int, force_reformat: bool = False): + # The 4K early-out only checks SectorSize, not metadata size, so it would + # silently skip a reformat needed to switch a 4K-no-md namespace to 4K-with-md. + # Callers that need a specific LBAF (e.g. md-capable) pass force_reformat=True. 
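# (Annotation, not part of the patch.) Net behavior for a namespace that
# nvme-list already reports as 4K:
#   force_reformat=False -> the early-out below skips the format entirely
#   force_reformat=True  -> the format runs even though SectorSize is already
#                           4096, switching a 4K-no-md namespace to the
#                           md-capable LBAF
# The `nvme format <dev> --lbaf=<id> --force` invocation itself is unchanged;
# only the early-out is bypassed.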
+ if not force_reformat and is_namespace_4k_from_nvme_list(nvme_device): logger.debug(f"Device {nvme_device} already formatted with 4K...skipping") return command = ['nvme', 'format', nvme_device, f"--lbaf={lbaf_id}", '--force'] diff --git a/tests/test_activate_portblock_and_suspend_no_demote.py b/tests/test_activate_portblock_and_suspend_no_demote.py new file mode 100644 index 000000000..ebe56c7a2 --- /dev/null +++ b/tests/test_activate_portblock_and_suspend_no_demote.py @@ -0,0 +1,409 @@ +# coding=utf-8 +""" +Regression tests for two related fixes: + +1. cluster_activate (Pass 2): on re-activation (cluster.status != UNREADY), + the configured primary's LVS port is firewall-blocked before + recreate_lvstore_on_non_leader runs and unblocked afterwards. On a fresh + activation (cluster.status == UNREADY) NO port-block is issued — peers + aren't serving yet, so there is nothing to quiesce. + Bug: a JCERR-driven cluster suspend followed by re-activation looped + forever with bs_load_cur_extent_page_valid CRC mismatch on the + secondary's examine because the live primary kept writing into the LVS + blob metadata. Observed 2026-05-11, LVS_6769 on node 8084. + +2. suspend_storage_node: after blocking the lvs+hublvol ports, the + explicit bdev_lvol_set_leader(leader=False) and + bdev_distrib_force_to_non_leader RPCs are NOT issued. With both ports + blocked the surviving peer auto-promotes; demoting now races pre-block + in-flight IO still completing on the local distrib → writer conflict. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.iface import IFace +from simplyblock_core.models.storage_node import StorageNode + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _node(uuid, status=StorageNode.STATUS_ONLINE, lvstore="LVS_A", + jm_vuid=1, primary_secondary=None, primary_tertiary=None, + mgmt_ip="10.0.0.1", rpc_port=8080, n_devices=4): + from simplyblock_core.models.nvme_device import NVMeDevice + n = StorageNode() + n.uuid = uuid + n.cluster_id = "cluster-1" + n.status = status + n.hostname = f"host-{uuid}" + n.mgmt_ip = mgmt_ip + n.rpc_port = rpc_port + n.rpc_username = "u" + n.rpc_password = "p" + n.lvstore = lvstore + n.lvstore_status = "ready" + n.jm_vuid = jm_vuid + n.is_secondary_node = False + n.lvstore_stack_secondary = "" + n.lvstore_stack_tertiary = "" + n.lvstore_ports = {lvstore: {"lvol_subsys_port": 4420, "hublvol_port": 4427}} + devs = [] + for i in range(n_devices): + d = NVMeDevice() + d.uuid = f"dev-{uuid}-{i}" + d.status = NVMeDevice.STATUS_ONLINE + devs.append(d) + n.nvme_devices = devs + n.remote_devices = [] + n.remote_jm_devices = [] + n.physical_label = 0 + n.secondary_node_id = primary_secondary or "" + n.tertiary_node_id = primary_tertiary or "" + n.data_nics = [IFace()] + n.data_nics[0].ip4_address = mgmt_ip + n.data_nics[0].trtype = "TCP" + n.active_tcp = True + n.active_rdma = False + return n + + +def _cluster(status=Cluster.STATUS_SUSPENDED, ha_type="ha", ftt=1): + c = Cluster() + c.uuid = "cluster-1" + c.status = status + c.ha_type = ha_type + c.max_fault_tolerance = ftt + c.distr_ndcs = 4 + c.distr_npcs = 2 + c.distr_bs = 4096 + c.distr_chunk_bs = 4096 + c.page_size_in_blocks = 128 + c.nqn = "nqn.cluster" + c.is_single_node = False + c.enable_node_affinity = False + c.backup_config = None + return c + + +# 
=========================================================================== +# 1. cluster_activate Pass 2 port-block wrapper +# =========================================================================== + + +class TestActivatePortBlockWrapper(unittest.TestCase): + """The Pass 2 firewall block runs only on re-activation, not on first + activation, and wraps recreate_lvstore_on_non_leader exactly once per + (snode, primary_node) pair.""" + + def _patch_cluster_activate_environment( + self, cluster, primary, secondary, + recreate_lvstore_ret=True, + recreate_non_leader_ret=True, + recreate_non_leader_exc=None, + firewall_block_exc=None, + firewall_allow_exc=None, + ): + """Returns a tuple of (patches_started, recorded_calls). + + Caller is responsible for stopping the patches (via the returned + contextmanager-like list) — done in the test's tearDown via addCleanup. + """ + from simplyblock_core import cluster_ops + + db = MagicMock() + db.get_cluster_by_id.return_value = cluster + db.get_storage_nodes_by_cluster_id.return_value = [primary, secondary] + db.get_storage_node_by_id.side_effect = lambda nid: ( + primary if nid == primary.get_id() else secondary) + db.get_cluster_capacity.return_value = [{"size_total": 1 << 40}] + db.get_qos.return_value = [] + + def _primary_for(node_id): + return [primary] if node_id == secondary.get_id() else [] + db.get_primary_storage_nodes_by_secondary_node_id.side_effect = _primary_for + + # FirewallClient: record block/allow calls. + fw_calls = [] + + class FakeFW: + def __init__(self, node, timeout=3, retry=1): + self._node = node + + def firewall_set_port(self, port, ptype, action, rpc_port, **kw): + fw_calls.append((self._node.get_id(), port, action)) + if action == "block" and firewall_block_exc: + raise firewall_block_exc + if action == "allow" and firewall_allow_exc: + raise firewall_allow_exc + + # storage_node_ops.recreate_lvstore* are heavy — replace with stubs. + recreate_calls = [] + + def _recreate_primary(snode, activation_mode=False, **kw): + recreate_calls.append(("primary", snode.get_id(), activation_mode)) + return recreate_lvstore_ret + + def _recreate_non_leader(snode, leader, primary_node, + activation_mode=False, **kw): + recreate_calls.append(("non_leader", snode.get_id(), + primary_node.get_id(), activation_mode)) + if recreate_non_leader_exc: + raise recreate_non_leader_exc + return recreate_non_leader_ret + + # tasks_controller: drop schedule fallback (the wrapper falls back to + # add_port_allow_task only when unblock RPC fails — we don't trigger + # that in the green-path tests below; still need a no-op shim). 
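# (Annotation, not part of the patch.) Control flow these fakes are arranged
# to observe; a sketch inferred from the assertions below, not a copy of
# cluster_ops:
#
#   fw.firewall_set_port(lvs_port, ptype, "block", rpc_port)  # quiesce leader
#   try:
#       recreate_lvstore_on_non_leader(snode, leader, primary_node,
#                                      activation_mode=True)
#   finally:
#       try:
#           fw.firewall_set_port(lvs_port, ptype, "allow", rpc_port)
#       except Exception:
#           tasks_controller.add_port_allow_task(cluster_id, node_id, lvs_port)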
+ scheduled_port_allow = [] + + def _add_port_allow_task(cluster_id, node_id, port): + scheduled_port_allow.append((cluster_id, node_id, port)) + + port_events = [] + + def _port_deny(node, port): + port_events.append(("deny", node.get_id(), port)) + + def _port_allowed(node, port): + port_events.append(("allow", node.get_id(), port)) + + patches = [ + patch.object(cluster_ops, "db_controller", db), + patch.object(cluster_ops, "DBController", return_value=db), + patch.object(cluster_ops, "FirewallClient", FakeFW), + patch.object(cluster_ops.tcp_ports_events, "port_deny", _port_deny), + patch.object(cluster_ops.tcp_ports_events, "port_allowed", _port_allowed), + patch.object(cluster_ops.tasks_controller, "add_port_allow_task", + _add_port_allow_task), + patch.object(cluster_ops.storage_node_ops, "recreate_lvstore", + _recreate_primary), + patch.object(cluster_ops.storage_node_ops, "recreate_lvstore_on_non_leader", + _recreate_non_leader), + patch.object(cluster_ops.storage_node_ops, "get_next_physical_device_order", + lambda *a, **kw: 0), + patch.object(cluster_ops.storage_node_ops, "get_secondary_nodes", + lambda *a, **kw: [secondary.get_id()]), + patch.object(cluster_ops.storage_node_ops, "get_secondary_nodes_2", + lambda *a, **kw: []), + patch.object(cluster_ops, "set_cluster_status", lambda *a, **kw: None), + patch.object(cluster_ops, "time", MagicMock()), + # qos: prevent FDB writes + patch.object(cluster_ops.qos_controller, "get_qos_weights_list", + lambda *a, **kw: []), + ] + # Ensure each node's rpc_client and snode.recreate_hublvol / connect / + # write_to_db are no-ops (Pass 3 + post-loop work). The model objects + # already exist; we patch their methods inline. + for n in (primary, secondary): + n.write_to_db = MagicMock(return_value=True) + n.rpc_client = MagicMock() + n.recreate_hublvol = MagicMock(return_value=True) + n.create_secondary_hublvol = MagicMock(return_value=True) + n.connect_to_hublvol = MagicMock(return_value=True) + n.client = MagicMock() + # Primary's per-LVS port lookup — already populated in _node(). + # Make is_qos_set on the cluster return False so the QOS branch is skipped. + cluster.is_qos_set = lambda: False + + for p in patches: + p.start() + return patches, fw_calls, recreate_calls, port_events, scheduled_port_allow + + def _run_activate(self, cluster, primary, secondary, **kw): + from simplyblock_core import cluster_ops + patches, fw_calls, recreate_calls, port_events, scheduled = \ + self._patch_cluster_activate_environment(cluster, primary, + secondary, **kw) + self.addCleanup(lambda: [p.stop() for p in patches]) + try: + cluster_ops.cluster_activate("cluster-1", force=True) + except ValueError: + # cluster_activate may raise on the LVSRestartRequiredError path; + # the tests below assert on it explicitly. + pass + return fw_calls, recreate_calls, port_events, scheduled + + # ----- tests ----- + + def test_reactivation_blocks_and_unblocks_leader_port(self): + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, recreate_calls, port_events, _ = self._run_activate( + cluster, primary, secondary) + + # The wrapper must have issued block on primary:4420, then allow on + # primary:4420 — once each, in that order, and surrounding the + # non_leader recreate call. 
+ fw_for_primary = [c for c in fw_calls if c[0] == "primary-1"] + self.assertEqual( + fw_for_primary, + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")], + f"unexpected firewall sequence: {fw_calls}") + # Recreate on the non-leader ran with activation_mode=True (we deliberately + # do NOT switch the helper out of activation_mode — only add the firewall). + non_leader_runs = [c for c in recreate_calls if c[0] == "non_leader"] + self.assertEqual(len(non_leader_runs), 1, recreate_calls) + _, snode_id, primary_id, activation_mode = non_leader_runs[0] + self.assertEqual(snode_id, "secondary-1") + self.assertEqual(primary_id, "primary-1") + self.assertTrue(activation_mode, + "Pass 2 must still call helper with activation_mode=True " + "— the firewall wrapper provides the only added op") + # tcp_ports_events emitted deny + allowed events on the primary. + self.assertIn(("deny", "primary-1", 4420), port_events) + self.assertIn(("allow", "primary-1", 4420), port_events) + + def test_fresh_activation_does_not_block_leader_port(self): + cluster = _cluster(status=Cluster.STATUS_UNREADY) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, recreate_calls, port_events, _ = self._run_activate( + cluster, primary, secondary) + + # On fresh activation NO port-block is issued — peers aren't serving + # yet and the existing activation_mode=True short-circuit handles + # everything. + self.assertEqual( + [c for c in fw_calls if c[0] == "primary-1"], [], + f"fresh activation must not block primary's port; got {fw_calls}") + self.assertEqual( + [e for e in port_events if e[1] == "primary-1"], [], + f"fresh activation must not emit port deny/allow events; got {port_events}") + + def test_reactivation_unblocks_when_recreate_raises(self): + """LVSRestartRequiredError out of recreate_lvstore_on_non_leader must + not leak a stuck-blocked leader port — the finally clause unblocks.""" + from simplyblock_core import storage_node_ops + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + err = storage_node_ops.LVSRestartRequiredError( + "secondary-1", "LVS_A", detail="examine did not produce lvstore") + fw_calls, _, _, scheduled = self._run_activate( + cluster, primary, secondary, recreate_non_leader_exc=err) + + fw_for_primary = [c for c in fw_calls if c[0] == "primary-1"] + # block then allow — even on the exception path. + self.assertEqual( + fw_for_primary, + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")], + f"finally-unblock missing on exception path: {fw_calls}") + # No port_allow_task scheduled — the unblock RPC itself succeeded. + self.assertEqual(scheduled, []) + + def test_reactivation_schedules_port_allow_task_on_unblock_failure(self): + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, _, _, scheduled = self._run_activate( + cluster, primary, secondary, + firewall_allow_exc=RuntimeError("network down")) + + # block recorded, allow attempted (and raised), so it is in fw_calls. + self.assertEqual( + [c for c in fw_calls if c[0] == "primary-1"], + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")]) + # Fallback task scheduled. 
+ self.assertEqual(scheduled, [("cluster-1", "primary-1", 4420)], + f"add_port_allow_task fallback missing: {scheduled}") + + +# =========================================================================== +# 2. suspend_storage_node — no leadership drop after port block +# =========================================================================== + + +class TestSuspendNoLeadershipDropAfterBlock(unittest.TestCase): + """suspend_storage_node must block lvs+hublvol ports but NOT issue + bdev_lvol_set_leader(leader=False) or bdev_distrib_force_to_non_leader + afterwards — the surviving peer auto-promotes on the closed redirect + and an explicit demote races pre-block in-flight IO.""" + + def _run(self, snode, secondary_owners=None): + from simplyblock_core import storage_node_ops + + db = MagicMock() + db.get_storage_node_by_id.return_value = snode + db.get_primary_storage_nodes_by_secondary_node_id.return_value = \ + secondary_owners or [] + + rpc_client = MagicMock() + snode.rpc_client = MagicMock(return_value=rpc_client) + snode.write_to_db = MagicMock(return_value=True) + + fw_calls = [] + + class FakeFW: + def __init__(self, node, timeout=20, retry=1): + self._node = node + + def firewall_set_port(self, port, ptype, action, rpc_port, **kw): + fw_calls.append((port, action)) + + patches = [ + patch.object(storage_node_ops, "DBController", return_value=db), + patch.object(storage_node_ops, "FirewallClient", FakeFW), + patch.object(storage_node_ops, "set_node_status", + lambda *a, **kw: None), + patch.object(storage_node_ops, "time", MagicMock()), + patch.object(storage_node_ops.tasks_controller, + "get_active_node_restart_task", + lambda *a, **kw: None), + patch.object(storage_node_ops.tasks_controller, + "get_active_node_tasks", lambda *a, **kw: []), + patch.object(storage_node_ops, "_check_ftt_allows_node_removal", + lambda *a, **kw: (True, "")), + ] + for p in patches: + p.start() + self.addCleanup(lambda: [p.stop() for p in patches]) + + ret = storage_node_ops.suspend_storage_node(snode.get_id()) + return ret, rpc_client, fw_calls + + def test_own_primary_lvs_blocks_ports_without_demote(self): + snode = _node("node-A", lvstore="LVS_A") + ret, rpc, fw_calls = self._run(snode) + self.assertTrue(ret) + # Both ports were blocked. + self.assertIn((4420, "block"), fw_calls) + self.assertIn((4427, "block"), fw_calls) + # And critically: no leadership drop after the block. + rpc.bdev_lvol_set_leader.assert_not_called() + rpc.bdev_distrib_force_to_non_leader.assert_not_called() + + def test_secondary_tertiary_lvs_blocks_ports_without_demote(self): + # snode also hosts secondary copy for primary "node-B" + snode = _node("node-A", lvstore="LVS_A") + snode.lvstore_stack_secondary = "node-B" + + primary_b = _node("node-B", lvstore="LVS_B", + mgmt_ip="10.0.0.99", rpc_port=8082) + primary_b.lvstore_ports = { + "LVS_B": {"lvol_subsys_port": 4430, "hublvol_port": 4431}} + + ret, rpc, fw_calls = self._run(snode, secondary_owners=[primary_b]) + self.assertTrue(ret) + # Sec lvs+hub ports blocked AND own primary lvs+hub ports blocked. + ports_blocked = {p for p, action in fw_calls if action == "block"} + self.assertIn(4430, ports_blocked) # sec lvs port (primary_b's lvs port) + self.assertIn(4431, ports_blocked) # sec hub port (primary_b's hub port) + self.assertIn(4420, ports_blocked) # own lvs port + self.assertIn(4427, ports_blocked) # own hub port + # No explicit demote on any LVS. 
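# (Annotation, not part of the patch.) Why the demote has to stay out:
# blocking the ports stops new client IO but does not halt requests already
# queued on the local distrib. An explicit demote lets those complete as
# non-leader while the peer auto-promotes, which is the writer-conflict race
# from the 2026-05-10 incident (jm_vuid=4245).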
+ rpc.bdev_lvol_set_leader.assert_not_called() + rpc.bdev_distrib_force_to_non_leader.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dhchap_e2e.py b/tests/test_dhchap_e2e.py index dbee35413..78415413b 100644 --- a/tests/test_dhchap_e2e.py +++ b/tests/test_dhchap_e2e.py @@ -441,7 +441,7 @@ def test_connect_lvol_includes_dhchap_secrets(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) + result, _err = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) self.assertTrue(len(result) > 0) cmd = result[0]["connect"] @@ -495,7 +495,7 @@ def test_connect_lvol_tls_only_with_psk(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) + result, _err = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) cmd = result[0]["connect"] self.assertIn("--tls", cmd) @@ -546,7 +546,7 @@ def test_connect_lvol_without_host_nqn_is_rejected_when_acl_exists(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1") + result, _err = lvol_ctl.connect_lvol("lvol-1") self.assertFalse(result) diff --git a/tests/test_dhchap_pool_level.py b/tests/test_dhchap_pool_level.py index 332184d2b..fbd33e13d 100644 --- a/tests/test_dhchap_pool_level.py +++ b/tests/test_dhchap_pool_level.py @@ -700,7 +700,7 @@ def test_host_with_dhchap_keys_injected_into_connect_cmd(self): } patcher, _ = _make_connect_ctx([host_entry]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() @@ -726,7 +726,7 @@ def test_host_with_psk_sets_tls_flag(self): } patcher, _ = _make_connect_ctx([host_entry]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() @@ -745,7 +745,7 @@ def test_missing_host_nqn_when_allowed_hosts_present_returns_false(self): patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn=None) + result, _err = connect_lvol("lvol-1", host_nqn=None) finally: patcher.stop() self.assertFalse(result) @@ -756,7 +756,7 @@ def test_unknown_host_nqn_returns_false(self): patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:intruder") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:intruder") finally: patcher.stop() self.assertFalse(result) @@ -768,7 +768,7 @@ def test_no_allowed_hosts_pass_through_with_host_nqn(self): patcher, _ = _make_connect_ctx([]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:whoever") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:whoever") finally: patcher.stop() @@ -791,7 +791,7 @@ def test_pool_level_dhchap_lvol_has_no_secret_in_connect_cmd(self): # Pool-level DHCHAP: lvol.allowed_hosts contains only nqn, no keys. 
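# (Annotation, not part of the patch.) Same mechanical change at every
# connect_lvol call site in these test diffs: the controller now returns a
# (result, err) two-tuple, so each test unpacks `result, _err` and keeps
# asserting on result alone.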
patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() diff --git a/tests/test_dual_ft_secondary_fixes.py b/tests/test_dual_ft_secondary_fixes.py index 74fd05028..111df3ed8 100644 --- a/tests/test_dual_ft_secondary_fixes.py +++ b/tests/test_dual_ft_secondary_fixes.py @@ -149,7 +149,7 @@ def get_node(nid): db.get_storage_node_by_id.side_effect = get_node db.get_cluster_by_id.return_value = cluster - result = connect_lvol("vol-1") + result, _err = connect_lvol("vol-1") self.assertTrue(result) self.assertEqual(len(result), 3) @@ -174,7 +174,7 @@ def test_connect_single_node_uses_primary_port(self, mock_db_cls): db.get_storage_node_by_id.return_value = primary db.get_cluster_by_id.return_value = cluster - result = connect_lvol("vol-1") + result, _err = connect_lvol("vol-1") self.assertTrue(result) self.assertEqual(result[0]["port"], 4420) diff --git a/tests/test_inline_checksum.py b/tests/test_inline_checksum.py new file mode 100644 index 000000000..f83bc6644 --- /dev/null +++ b/tests/test_inline_checksum.py @@ -0,0 +1,308 @@ +# coding=utf-8 +""" +test_inline_checksum.py – unit tests for the per-cluster inline CRC checksum +validation feature (TD.100226.1). + +Covers: + * Cluster.inline_checksum / NVMeDevice.md_size / md_supported model defaults. + * find_md_lbaf_id helper – LBAF selection from `nvme id-ns` JSON. + * alceml_checksum_params helper – cluster flag + per-device md combo logic. + * alceml_fallback_overhead_bytes helper – capacity-overhead math. + * bdev_alceml_create RPC – correct param wire-up for each method. + * addNvmeDevices – md_size flowing from SPDK bdev JSON onto NVMeDevice. +""" + +import json +import unittest +from unittest.mock import patch, MagicMock + +from simplyblock_core import utils +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core.rpc_client import RPCClient + + +def _make_rpc_client(): + with patch("requests.session"): + return RPCClient("127.0.0.1", 8081, "user", "pass", timeout=1, retry=0) + + +# --------------------------------------------------------------------------- +# Model defaults & persistence +# --------------------------------------------------------------------------- +class TestModelDefaults(unittest.TestCase): + def test_cluster_inline_checksum_defaults_off(self): + c = Cluster() + self.assertFalse(c.inline_checksum) + + def test_cluster_inline_checksum_can_be_set(self): + c = Cluster() + c.inline_checksum = True + self.assertTrue(c.inline_checksum) + + def test_nvme_device_md_fields_default_off(self): + d = NVMeDevice() + self.assertEqual(d.md_size, 0) + self.assertFalse(d.md_supported) + + def test_nvme_device_md_fields_round_trip(self): + d = NVMeDevice({'md_size': 16, 'md_supported': True}) + self.assertEqual(d.md_size, 16) + self.assertTrue(d.md_supported) + + +# --------------------------------------------------------------------------- +# find_md_lbaf_id +# --------------------------------------------------------------------------- +class TestFindMdLbafId(unittest.TestCase): + def _idns(self, lbafs): + return json.dumps({'lbafs': lbafs}) + + def test_returns_none_when_no_md_lbaf(self): + # Only 4K-no-md available. 
+ s = self._idns([ + {'ms': 0, 'ds': 9}, + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_picks_smallest_ms_above_min(self): + # 4K-with-8B is preferred over 4K-with-64B (waste less space). + s = self._idns([ + {'ms': 0, 'ds': 12}, # idx 0 – no md + {'ms': 64, 'ds': 12}, # idx 1 + {'ms': 8, 'ds': 12}, # idx 2 – preferred + {'ms': 16, 'ds': 12}, # idx 3 + ]) + self.assertEqual(utils.find_md_lbaf_id(s), 2) + + def test_skips_non_matching_ds(self): + # An LBAF with ms>=8 but ds!=12 must not be selected. + s = self._idns([ + {'ms': 8, 'ds': 9}, # 512B with md – ignore + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_below_min_ms_excluded(self): + s = self._idns([ + {'ms': 4, 'ds': 12}, # below 8B threshold + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_invalid_json_returns_none(self): + self.assertIsNone(utils.find_md_lbaf_id("not json")) + self.assertIsNone(utils.find_md_lbaf_id(None)) + + def test_empty_lbafs_returns_none(self): + self.assertIsNone(utils.find_md_lbaf_id(self._idns([]))) + + +# --------------------------------------------------------------------------- +# alceml_checksum_params +# --------------------------------------------------------------------------- +class TestAlcemlChecksumParams(unittest.TestCase): + def test_off_when_cluster_flag_off(self): + c = Cluster({'inline_checksum': False}) + d = NVMeDevice({'md_supported': True}) + self.assertEqual(utils.alceml_checksum_params(c, d), (0, 0, 0)) + + def test_method_1_when_md_supported(self): + c = Cluster({'inline_checksum': True}) + d = NVMeDevice({'md_supported': True, 'md_size': 8}) + self.assertEqual(utils.alceml_checksum_params(c, d), (1, 0, 0)) + + def test_method_2_when_md_unsupported(self): + c = Cluster({'inline_checksum': True}) + d = NVMeDevice({'md_supported': False, 'md_size': 0}) + self.assertEqual(utils.alceml_checksum_params(c, d), (2, 0, 0)) + + def test_off_for_cluster_without_attribute(self): + # Old DB record (no inline_checksum field) must behave as off. + class _Old: + pass + d = NVMeDevice({'md_supported': True}) + self.assertEqual(utils.alceml_checksum_params(_Old(), d), (0, 0, 0)) + + +# --------------------------------------------------------------------------- +# alceml_fallback_overhead_bytes +# --------------------------------------------------------------------------- +class TestFallbackOverhead(unittest.TestCase): + def test_zero_when_flag_off(self): + c = Cluster({'inline_checksum': False, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, 100 * 2 * 1024 * 1024), 0) + + def test_zero_for_zero_or_negative_size(self): + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, 0), 0) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, -1), 0) + + def test_six_blocks_per_page(self): + # 100 pages × 2 MiB = 200 MiB device. Overhead = 100 × 6 × 4 KiB = 2400 KiB. + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + device_size = 100 * 2 * 1024 * 1024 + expected = 100 * 6 * 4096 + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, device_size), expected) + + def test_partial_page_floored(self): + # 1.5 pages → only 1 full page counts (page-granular accounting). 
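# (Annotation, not part of the patch.) Worked numbers behind the
# 6-blocks-per-page charge: a 2 MiB page holds 512 raw 4 KiB blocks; in
# fallback mode the layout's 510 data blocks shrink to 504 (one extended-md
# block plus five filler), losing 6 blocks (24 KiB) per page. Ratio
# 6/512 = 0.01171875 (~1.17%); a 1 TiB device (2**19 pages) is charged
# exactly 2**19 * 6 * 4096 bytes = 12 GiB.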
+ c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + partial = (1 * 2 * 1024 * 1024) + (1 * 1024 * 1024) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, partial), 1 * 6 * 4096) + + def test_overhead_is_about_1_17_percent(self): + # Sanity: the design doc cites ~1.17% overhead in fallback mode. + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + size = 1024 * 2 * 1024 * 1024 # 2 GiB, 1024 pages + ratio = utils.alceml_fallback_overhead_bytes(c, size) / size + self.assertAlmostEqual(ratio, 6 / 512, places=6) + + +# --------------------------------------------------------------------------- +# bdev_alceml_create RPC params +# --------------------------------------------------------------------------- +class TestBdevAlcemlCreateRPC(unittest.TestCase): + @patch.object(RPCClient, "_request") + def test_no_checksum_params_when_method_zero(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create("alc_x", "nvme0", "uuid-1") + params = mock_req.call_args[0][1] + self.assertNotIn("checksum_validation_method", params) + self.assertNotIn("cache_size", params) + self.assertNotIn("cache_eviction_threshold", params) + + @patch.object(RPCClient, "_request") + def test_method_1_only_emits_method_field(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create("alc_x", "nvme0", "uuid-1", checksum_method=1) + params = mock_req.call_args[0][1] + self.assertEqual(params["checksum_validation_method"], 1) + # Defaults of 0 must not be sent so the data plane uses its own defaults. + self.assertNotIn("cache_size", params) + self.assertNotIn("cache_eviction_threshold", params) + + @patch.object(RPCClient, "_request") + def test_method_2_with_explicit_cache_overrides(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create( + "alc_x", "nvme0", "uuid-1", + checksum_method=2, cache_size=1500, cache_eviction_threshold=85, + ) + params = mock_req.call_args[0][1] + self.assertEqual(params["checksum_validation_method"], 2) + self.assertEqual(params["cache_size"], 1500) + self.assertEqual(params["cache_eviction_threshold"], 85) + + @patch.object(RPCClient, "_request") + def test_existing_params_unchanged(self, mock_req): + # Regression guard: the new kwargs must not perturb the well-known params + # the data plane expects. 
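# (Annotation, not part of the patch.) Param-emission rule this class pins
# down, inferred from the assertions and assuming bdev_alceml_create builds
# its params dict conditionally:
#   checksum_method == 0          -> none of the three new fields is sent
#   checksum_method in (1, 2)     -> "checksum_validation_method" is sent
#   cache_size / threshold == 0   -> omitted; the data plane keeps its
#                                    built-in defaults (2000 entries / 90%)
#   nonzero cache overrides       -> sent verbatim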
+ mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create( + "alc_x", "nvme0", "uuid-1", + pba_init_mode=2, pba_page_size=2 * 1024 * 1024, + write_protection=True, full_page_unmap=True, checksum_method=1, + ) + params = mock_req.call_args[0][1] + self.assertEqual(params["name"], "alc_x") + self.assertEqual(params["cntr_path"], "nvme0") + self.assertEqual(params["uuid"], "uuid-1") + self.assertEqual(params["pba_init_mode"], 2) + self.assertEqual(params["pba_page_size"], 2 * 1024 * 1024) + self.assertTrue(params["write_protection"]) + self.assertTrue(params["use_map_whole_page_on_1st_write"]) + self.assertEqual(params["checksum_validation_method"], 1) + + +# --------------------------------------------------------------------------- +# addNvmeDevices md detection +# --------------------------------------------------------------------------- +class TestAddNvmeDevicesMd(unittest.TestCase): + def _make_rpc_with_bdev(self, *, md_size): + rpc = MagicMock() + rpc.bdev_nvme_controller_list.return_value = [] + rpc.bdev_nvme_controller_attach.return_value = ["nvmeX_n1"] + rpc.bdev_examine.return_value = True + rpc.bdev_wait_for_examine.return_value = True + # SPDK bdev_get_bdevs payload – the only md-relevant field is the + # top-level uint32 md_size set by spdk_bdev_get_md_size. + bdev_payload = [{ + 'name': 'nvmeX_n1', + 'block_size': 4096, + 'num_blocks': 100 * 1024 * 1024 // 4096, # 100 MiB + 'md_size': md_size, + 'driver_specific': { + 'nvme': [{ + 'pci_address': '0000:00:01.0', + 'ctrlr_data': { + 'model_number': 'TEST_NVME', + 'serial_number': 'SN-TEST-1', + }, + }], + }, + }] + rpc.get_bdevs.return_value = bdev_payload + return rpc + + def _make_snode(self): + snode = MagicMock() + snode.physical_label = 0 + snode.id_device_by_nqn = False + snode.get_id.return_value = "snode-1" + snode.cluster_id = "cluster-1" + return snode + + def test_md_size_zero_marks_unsupported(self): + rpc = self._make_rpc_with_bdev(md_size=0) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(len(devs), 1) + self.assertEqual(devs[0].md_size, 0) + self.assertFalse(devs[0].md_supported) + + def test_md_size_8_marks_supported(self): + rpc = self._make_rpc_with_bdev(md_size=8) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 8) + self.assertTrue(devs[0].md_supported) + + def test_md_size_below_threshold_marks_unsupported(self): + # Pre-existing PI-only formats expose ms=4 (T10 PI's reftag field + # alone). That's < 8 bytes so checksums won't fit – treat as no-md. + rpc = self._make_rpc_with_bdev(md_size=4) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 4) + self.assertFalse(devs[0].md_supported) + + def test_missing_md_size_field_treated_as_zero(self): + # Older SPDK builds that omit the field entirely must not crash. 
+ rpc = MagicMock() + rpc.bdev_nvme_controller_list.return_value = [] + rpc.bdev_nvme_controller_attach.return_value = ["nvmeX_n1"] + rpc.get_bdevs.return_value = [{ + 'name': 'nvmeX_n1', + 'block_size': 4096, + 'num_blocks': 100 * 1024 * 1024 // 4096, + 'driver_specific': { + 'nvme': [{ + 'pci_address': '0000:00:01.0', + 'ctrlr_data': {'model_number': 'M', 'serial_number': 'S'}, + }], + }, + }] + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 0) + self.assertFalse(devs[0].md_supported) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_nvmeof_security.py b/tests/test_nvmeof_security.py index 644615854..44aae70ae 100644 --- a/tests/test_nvmeof_security.py +++ b/tests/test_nvmeof_security.py @@ -671,7 +671,7 @@ def test_connect_with_psk_includes_tls_flag(self, MockDBCtrl): mock_db.get_cluster_by_id.return_value = cl MockDBCtrl.return_value = mock_db - result = connect_lvol("lvol-1", host_nqn="nqn:host1") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host1") self.assertTrue(len(result) > 0) entry = result[0] self.assertIn("--tls", entry["connect"]) @@ -696,7 +696,7 @@ def test_connect_without_tls_no_flag(self, MockDBCtrl): mock_db.get_cluster_by_id.return_value = cl MockDBCtrl.return_value = mock_db - result = connect_lvol("lvol-1") + result, _err = connect_lvol("lvol-1") self.assertTrue(len(result) > 0) entry = result[0] self.assertNotIn("tls", entry) diff --git a/tests/test_peer_disconnect.py b/tests/test_peer_disconnect.py index 816d86ee6..31ccecce9 100644 --- a/tests/test_peer_disconnect.py +++ b/tests/test_peer_disconnect.py @@ -41,12 +41,20 @@ class TestCheckPeerDisconnected(unittest.TestCase): def _run(self, status, quorum_result=False): # _check_peer_disconnected imports ``is_node_data_plane_disconnected_quorum`` # locally from the services module at call time, so the patch must target - # the source module (where the symbol lives). + # the source module (where the symbol lives). The function also re-fetches + # the peer from FDB before reading status (cc3b4811), so DBController must + # be patched to return the same mock — otherwise the lookup hits + # uninitialised FDB and the KeyError fallback returns True for every input, + # masking the branch under test. from simplyblock_core import storage_node_ops as mod peer = _node(status=status) + db = MagicMock() + db.get_storage_node_by_id.return_value = peer with patch( "simplyblock_core.services.storage_node_monitor.is_node_data_plane_disconnected_quorum", - return_value=quorum_result) as q: + return_value=quorum_result) as q, \ + patch("simplyblock_core.storage_node_ops.DBController", + return_value=db): return mod._check_peer_disconnected(peer), q # -----------------------------------------------------------------
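Reviewer note on the DBController patch above: a minimal sketch of the call shape it guards against, assumed from the comment (cc3b4811 made _check_peer_disconnected re-fetch the peer from FDB before trusting its status; names other than DBController/get_storage_node_by_id are stand-ins):

    def _check_peer_disconnected_shape(peer, db_controller_cls):
        # The test patch routes this constructor to the shared mock; without
        # it the lookup hits uninitialised FDB, raises KeyError, and the
        # fallback returns True for every status, masking the branch under
        # test.
        db = db_controller_cls()
        fresh = db.get_storage_node_by_id(peer.get_id())
        return fresh.status  # status is read from the re-fetched record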