diff --git a/scripts/lab_dual_node_outage_soak_mixed_churn.py b/scripts/lab_dual_node_outage_soak_mixed_churn.py new file mode 100644 index 000000000..139dd48d5 --- /dev/null +++ b/scripts/lab_dual_node_outage_soak_mixed_churn.py @@ -0,0 +1,1892 @@ +#!/usr/bin/env python3 +"""Lab variant of aws_dual_node_outage_soak_mixed_churn.py. + +Designed to run from the simplyblock jump host against the bare-metal lab +cluster (mgmt 192.168.10.210, storage .201-.204) deployed by +setup_lab_perf_test1.py. + +Iteration pattern (load during outage, unload during settle): + 1. start fio on every volume + 2. apply the dual-node outage pair (fio takes the hit) + 3. wait for both nodes back online + post-outage check_fio (fault gate) + 4. stop fio + 5. optionally rebuild one randomly-selected volume + (every --churn-every-n-iters iterations) + 6. wait_for_cluster_stable + wait_for_data_migration_complete -- now + UNLOADED, so migration drains fast + 7. next iteration + +This trades the AWS-variant's "fio runs continuously across outages plus +a 3-20 minute background churn timer" for a deterministic per-iteration +pattern: settling never happens under fio load, so iteration time drops +to whatever rebalance takes on a quiet cluster. There is no inter- +iteration NIC chaos. + +Differences vs. the AWS variant beyond the iteration pattern: + - SSH uses a single shared root password (CLI flag, env var, or prompt), + not an EC2 keypair. Install paramiko if you can ("pip3 install --user + paramiko" on the jump host); otherwise the script falls back to + `sshpass + ssh`, which is slower because every command opens a fresh + connection. + - Defaults match the lab topology: --expected-node-count 4, metadata + file cluster_metadata_base.json (the file setup_lab_perf_test1.py + writes), mount root /root/lab_outage_soak_*. + +Typical invocation (from the jump host, after setup_lab_perf_test1.py has +created the cluster and written cluster_metadata_base.json next to this +script): + + python3 ~/lab_dual_node_outage_soak_mixed_churn.py # prompts for password + SBCLI_ROOT_PASSWORD='...' python3 ~/lab_dual_node_outage_soak_mixed_churn.py + python3 ~/lab_dual_node_outage_soak_mixed_churn.py --password '...' +""" +import argparse +import getpass +import itertools +import json +import logging +import os +import posixpath +import random +import re +import shlex +import subprocess +import sys +import threading +import time +from dataclasses import dataclass +from pathlib import Path + +try: + import paramiko +except ImportError: + paramiko = None + +# Silence paramiko's Transport-thread "Socket exception: Connection +# reset by peer (104)" prints. They fire whenever an open SSH +# connection to a storage node gets RST'd by a planned event — +# host_reboot outage tearing down sshd, NIC down/up flapping, etc. +# The retry/reconnect logic handles it cleanly; the stack-trace-less +# stderr lines just clutter the soak output. +logging.getLogger("paramiko").setLevel(logging.CRITICAL) +logging.getLogger("paramiko.transport").setLevel(logging.CRITICAL) + + +UUID_RE = re.compile(r"[a-f0-9]{8}(?:-[a-f0-9]{4}){3}-[a-f0-9]{12}") +# `sbctl lvol connect` emits `sudo nvme connect ... --nqn= ...` +# (long form with `=`, see lvol_controller.py:1737). Tolerate the legacy +# short form `-n ` as well so older sbctl deployments still parse. 
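+# Illustrative matches for NQN_RE (the NQN value is a made-up placeholder,
+# not output captured from a live cluster):
+#   "sudo nvme connect ... --nqn=nqn.2023-02.io.simplyblock:lvol-1"
+#   "sudo nvme connect ... -n nqn.2023-02.io.simplyblock:lvol-1"
+# both yield m.group(1) == "nqn.2023-02.io.simplyblock:lvol-1".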
+NQN_RE = re.compile(r"(?:--nqn[=\s]+|-n\s+)(\S+)") + + +OUTAGE_METHODS = ( + "graceful", "forced", "container_kill", "host_reboot", + "network_outage_20", "network_outage_50", +) +# Methods that leave the node in a state where it recovers on its own +# (no sbctl restart required from the soak driver). +AUTO_RECOVER_METHODS = ( + "container_kill", "host_reboot", + "network_outage_20", "network_outage_50", +) + +# Scenario enumeration: +# 3 role categories × P(M,2) ordered distinct-method pairs +# = 3 × M·(M-1) scenarios per cycle. +# Examples: +# M=5 → 3 × 20 = 60 +# M=6 → 3 × 30 = 90 +# Role categories (relative ring-distance preserved; the actual node pair +# is re-rolled randomly per scenario at execution time so the soak hits +# many different concrete pairs while keeping the topological distance +# fixed for each category). +# Order matters: the soak walks the full method permutation for one +# category before moving on. "unrelated" runs first so the outage with +# the widest blast-radius coverage (two nodes from different LVS rings) +# exercises the cluster before the within-ring categories. +# - unrelated : pair sharing no LVS in any role — ring-distance +# ≥ 3 (≥ 2 nodes between). +# - primary_tertiary : primary + tertiary of same LVS — ring-distance +# 2 (exactly one node between); no replication +# edge connects them (jumps over the secondary). +# - primary_secondary : primary + secondary of same LVS — ring-distance +# 1 (direct successor). Represents both (P,S) and +# (S,T): two adjacent replicas of the same LVS +# going down is structurally symmetric regardless +# of which end. +# Same-method pairs (graceful,graceful etc.) are not enumerated — the +# user-agreed count 30 for 6 methods equals 6·5, not 6². +ROLE_CATEGORIES = ("unrelated", "primary_tertiary", "primary_secondary") + + +def parse_args(): + default_metadata = Path(__file__).with_name("cluster_metadata_base.json") + default_log_dir = Path(__file__).parent + + parser = argparse.ArgumentParser( + description=( + "Run a long fio soak against the lab cluster while cycling random " + "two-node outages with mixed outage methods. Each iteration: start " + "fio on every volume, apply the outage pair, fault-check fio, " + "stop fio, optionally rebuild one volume, then wait for the cluster " + "to settle UNLOADED (no IO pressure on rebalance/data-migration). " + "Trades 'fio always running' for fast iteration time." 
+ ) + ) + parser.add_argument("--metadata", default=str(default_metadata), help="Path to cluster metadata JSON.") + parser.add_argument("--pool", default="pool01", help="Pool name for volume creation.") + parser.add_argument("--expected-node-count", type=int, default=4, help="Required storage node count.") + parser.add_argument("--volume-size", default="25G", help="Volume size to create per storage node.") + parser.add_argument("--runtime", type=int, default=72000, help="fio runtime in seconds.") + parser.add_argument("--restart-timeout", type=int, default=900, help="Seconds to wait for restarted nodes.") + parser.add_argument("--rebalance-timeout", type=int, default=7200, help="Seconds to wait for rebalancing.") + parser.add_argument("--poll-interval", type=int, default=10, help="Poll interval for health checks.") + parser.add_argument( + "--shutdown-gap", + type=int, + default=0, + help="Optional delay between shutting down the two selected nodes.", + ) + parser.add_argument( + "--log-file", + default=str(default_log_dir / f"aws_dual_node_outage_soak_{time.strftime('%Y%m%d_%H%M%S')}.log"), + help="Single log file for script and CLI output.", + ) + parser.add_argument( + "--run-on-mgmt", + action="store_true", + help="Run management-node commands locally instead of over SSH.", + ) + parser.add_argument( + "--password", + default=None, + help=( + "Root password shared by mgmt+storage nodes. If omitted, falls " + "back to the SBCLI_ROOT_PASSWORD env var, then to an interactive " + "prompt. Avoid the flag form on shared hosts (visible in `ps`)." + ), + ) + parser.add_argument( + "--methods", + default=",".join(OUTAGE_METHODS), + help=( + "Comma-separated subset of outage methods to pick from per iteration. " + f"Choices: {','.join(OUTAGE_METHODS)}. " + "Each iteration picks 2 distinct methods at random." + ), + ) + parser.add_argument( + "--auto-recover-wait", + type=int, + default=900, + help=( + "Seconds to wait for a node to return online after a container_kill " + "or host_reboot outage (no sbctl restart is issued)." + ), + ) + parser.add_argument( + "--cycles", + type=int, + default=1, + help=( + "Number of passes through the full deterministic scenario list. " + "Each pass covers C(N,2)*M² scenarios (250 for 5 nodes × 5 methods; " + "540 for 6 × 6). 0 means loop forever." + ), + ) + parser.add_argument( + "--shuffle-scenarios", + action="store_true", + help=( + "Shuffle scenario order per cycle (seeded deterministically off " + "the cycle index). Useful when a full cycle is too long to finish " + "and you want even coverage across early/mid/late pairs." + ), + ) + parser.add_argument( + "--start-at", + type=int, + default=1, + help=( + "Start the first cycle at scenario N (1-indexed). Scenarios " + "1..N-1 are skipped in the first cycle only; subsequent cycles " + "run from scenario 1 as normal. Use to resume after a failure — " + "e.g. --start-at 60 if scenario 60 is the one that failed." + ), + ) + parser.add_argument( + "--churn-every-n-iters", + type=int, + default=3, + help=( + "Rebuild one randomly-chosen volume every N outage iterations, " + "in the unloaded window between fio-stop and rebalance-wait. " + "Default 3. Set to 0 to disable; --no-churn is an explicit alias." 
+ ), + ) + parser.add_argument( + "--no-churn", + action="store_true", + help="Disable per-iteration volume churn entirely.", + ) + args = parser.parse_args() + methods = [m.strip() for m in args.methods.split(",") if m.strip()] + bad = [m for m in methods if m not in OUTAGE_METHODS] + if bad: + parser.error(f"Unknown outage method(s): {bad}. Choices: {list(OUTAGE_METHODS)}") + if not methods: + parser.error("At least one outage method must be enabled") + args.methods = methods + if args.churn_every_n_iters < 0: + parser.error("--churn-every-n-iters must be >= 0") + args.password = resolve_password(args.password) + if not args.password: + parser.error("Empty root password; supply --password, SBCLI_ROOT_PASSWORD, or answer the prompt.") + if subprocess.run(["which", "sshpass"], capture_output=True).returncode != 0 and paramiko is None: + parser.error( + "Neither paramiko nor sshpass is available on this host. " + "Install one ('pip3 install --user paramiko' or 'sudo dnf install sshpass') " + "before running." + ) + return args + + +def resolve_password(cli_value): + if cli_value: + return cli_value + env_value = os.environ.get("SBCLI_ROOT_PASSWORD") + if env_value: + return env_value + return getpass.getpass("Root password for lab nodes (.210, .201-.204): ") + + +def load_metadata(path): + with open(path, "r", encoding="utf-8") as handle: + return json.load(handle) + + +class Logger: + def __init__(self, path): + self.path = path + self.lock = threading.Lock() + Path(path).parent.mkdir(parents=True, exist_ok=True) + + def log(self, message): + line = f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}" + with self.lock: + print(line, flush=True) + with open(self.path, "a", encoding="utf-8") as handle: + handle.write(line + "\n") + + def block(self, header, content): + if content is None: + return + text = content.rstrip() + if not text: + return + with self.lock: + with open(self.path, "a", encoding="utf-8") as handle: + handle.write(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {header}\n") + handle.write(text + "\n") + + +class RemoteCommandError(RuntimeError): + pass + + +class RemoteHost: + def __init__(self, hostname, user, password, logger, name): + self.hostname = hostname + self.user = user + self.password = password + self.logger = logger + self.name = name + self.client = None + self.connect() + + def connect(self): + if paramiko is None: + return + self.close() + last_error = None + for attempt in range(1, 16): + try: + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect( + hostname=self.hostname, + username=self.user, + password=self.password, + timeout=15, + banner_timeout=15, + auth_timeout=15, + allow_agent=False, + look_for_keys=False, + ) + transport = client.get_transport() + if transport is not None: + transport.set_keepalive(30) + self.client = client + return + except Exception as exc: + last_error = exc + self.logger.log( + f"{self.name}: SSH attempt {attempt}/15 failed to {self.hostname}: {exc}" + ) + time.sleep(5) + raise RemoteCommandError(f"{self.name}: failed to connect to {self.hostname}: {last_error}") + + def run(self, command, timeout=600, check=True, label=None): + if paramiko is None: + return self._run_via_ssh_cli(command, timeout=timeout, check=check, label=label) + if self.client is None: + self.connect() + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + try: + stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout) + stdout_text = 
stdout.read().decode("utf-8", errors="replace") + stderr_text = stderr.read().decode("utf-8", errors="replace") + rc = stdout.channel.recv_exit_status() + except Exception as exc: + self.logger.log(f"{self.name}: command transport failure for {label}: {exc}; reconnecting once") + self.connect() + stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout) + stdout_text = stdout.read().decode("utf-8", errors="replace") + stderr_text = stderr.read().decode("utf-8", errors="replace") + rc = stdout.channel.recv_exit_status() + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError( + f"{self.name}: command failed with rc={rc}: {label}" + ) + return rc, stdout_text, stderr_text + + def _run_via_ssh_cli(self, command, timeout=600, check=True, label=None): + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + ssh_cmd = [ + "sshpass", "-e", + "ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "LogLevel=ERROR", + "-o", "ConnectTimeout=15", + "-o", "ServerAliveInterval=30", + f"{self.user}@{self.hostname}", + command, + ] + env = os.environ.copy() + env["SSHPASS"] = self.password + try: + completed = subprocess.run( + ssh_cmd, + capture_output=True, + text=True, + timeout=timeout, + check=False, + env=env, + ) + except subprocess.TimeoutExpired as exc: + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + raise RemoteCommandError(f"{self.name}: command timed out: {label}") from exc + stdout_text = completed.stdout or "" + stderr_text = completed.stderr or "" + rc = completed.returncode + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError(f"{self.name}: command failed with rc={rc}: {label}") + return rc, stdout_text, stderr_text + + def close(self): + if self.client is not None: + self.client.close() + self.client = None + + +class LocalHost: + def __init__(self, logger, name): + self.logger = logger + self.name = name + + def run(self, command, timeout=600, check=True, label=None): + label = label or command + self.logger.log(f"{self.name}: RUN {label}") + try: + completed = subprocess.run( + ["/bin/bash", "-lc", command], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + except subprocess.TimeoutExpired as exc: + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + raise RemoteCommandError(f"{self.name}: command timed out: {label}") from exc + stdout_text = completed.stdout or "" + stderr_text = completed.stderr or "" + rc = completed.returncode + self.logger.block(f"{self.name}: STDOUT for {label}", stdout_text) + self.logger.block(f"{self.name}: STDERR for {label}", stderr_text) + if check and rc != 0: + raise RemoteCommandError(f"{self.name}: command failed with rc={rc}: {label}") + return rc, stdout_text, stderr_text + + def close(self): + return + + +# Number of fio worker processes per --name. Must match --numjobs in +# start_fio(). 
+# --group_reporting aggregates all workers into one report, so a single
+# fio summary + per-run stderr stream is sufficient to diagnose any fio
+# fault.
+FIO_NUMJOBS = 4
+
+
+@dataclass
+class FioJob:
+    volume_id: str
+    volume_name: str
+    mount_point: str
+    fio_log: str     # fio's --output summary file (written on exit)
+    fio_stderr: str  # captured stdout+stderr during the run (progress,
+                     # errors, "max_latency exceeded" messages). This is
+                     # the primary source of ground truth for fio faults.
+    rc_file: str
+    pid: int
+    fio_name: str    # matches --name= in the fio command line
+
+
+class TestRunError(RuntimeError):
+    pass
+
+
+class SoakRunner:
+    def __init__(self, args, metadata, logger):
+        self.args = args
+        self.metadata = metadata
+        self.logger = logger
+        self.user = metadata["user"]
+        self.password = args.password
+        self.run_id = time.strftime("%Y%m%d_%H%M%S")
+        if args.run_on_mgmt:
+            self.mgmt = LocalHost(logger, "mgmt")
+        else:
+            self.mgmt = RemoteHost(metadata["mgmt"]["public_ip"], self.user, self.password, logger, "mgmt")
+        client_entry = metadata["clients"][0]
+        if args.run_on_mgmt:
+            client_addr = client_entry.get("private_ip") or client_entry["public_ip"]
+        else:
+            client_addr = client_entry["public_ip"]
+        self.client = RemoteHost(client_addr, self.user, self.password, logger, "client")
+        self.cluster_id = metadata.get("cluster_uuid") or ""
+        self.fio_jobs = []
+        # Stored so the churn cycle can pick a random volume to rebuild.
+        self.volumes = []
+        self.created_volume_ids = []
+        # Mixed-outage state
+        self.methods = list(args.methods)
+        # Dropping all data NICs on a node (network_outage_*) is a
+        # simple-cluster-only scenario, so multipath clusters exclude those
+        # methods up front. (Unlike the AWS variant, this script has no
+        # inter-iteration single-NIC chaos to supply network-layer coverage
+        # on multipath clusters.)
+        if self._is_multipath():
+            filtered = [m for m in self.methods if not m.startswith("network_outage_")]
+            dropped = [m for m in self.methods if m not in filtered]
+            if dropped:
+                self.logger.log(
+                    f"multipath cluster detected: excluding {dropped} from outage methods"
+                )
+            if not filtered:
+                raise TestRunError(
+                    "No outage methods remain after excluding network_outage_* "
+                    "on multipath cluster; pass --methods with at least one "
+                    "non-network_outage method"
+                )
+            self.methods = filtered
+        self.node_hosts = {}  # uuid -> RemoteHost (private_ip of storage node)
+        self.node_ip_map = self._build_node_ip_map()
+        # Counter incremented by every churn cycle; embedded in fio --name and
+        # the rebuilt volume name so logs/pkill targets remain unique across
+        # iterations even when the same volume index is rebuilt repeatedly.
+        self.churn_counter = 0
+        self.mount_root = None
+
+    def close(self):
+        self.client.close()
+        self.mgmt.close()
+        for host in self.node_hosts.values():
+            try:
+                host.close()
+            except Exception:
+                pass
+
+    def _build_node_ip_map(self):
+        """Return {uuid: private_ip} for every storage node we know about."""
+        ip_map = {}
+        topology = self.metadata.get("topology") or {}
+        for node in topology.get("nodes", []):
+            uuid = node.get("uuid")
+            ip = node.get("management_ip") or node.get("private_ip")
+            if uuid and ip:
+                ip_map[uuid] = ip
+        # Fallback: pair storage_nodes list with sbctl-returned UUIDs by mgmt IP,
+        # which is done lazily in _resolve_node_ip below.
+        return ip_map
+
+    def _resolve_node_ip(self, uuid):
+        """Return the private/mgmt IP for a storage node UUID, refreshing via
+        sbctl if we haven't seen it in metadata."""
+        ip = self.node_ip_map.get(uuid)
+        if ip:
+            return ip
+        # Try fetching via sbctl sn list JSON.
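+        # Assumed row shape for the key fallbacks below (field names vary
+        # across sbctl versions; values here are illustrative, not captured
+        # output):
+        #   {"UUID": "<node-uuid>", "Status": "online",
+        #    "Mgmt IP": "192.168.10.201", ...}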
+ nodes = self.sbctl("sn list --json", json_output=True) + for node in nodes: + candidate_ip = ( + node.get("Management IP") + or node.get("Mgmt IP") + or node.get("mgmt_ip") + or node.get("management_ip") + ) + if node.get("UUID") == uuid and candidate_ip: + self.node_ip_map[uuid] = candidate_ip + return candidate_ip + raise TestRunError(f"Cannot resolve storage-node IP for UUID {uuid}") + + def _node_host(self, uuid): + """Lazily create a RemoteHost for a storage node identified by UUID.""" + if uuid in self.node_hosts: + return self.node_hosts[uuid] + ip = self._resolve_node_ip(uuid) + host = RemoteHost(ip, self.user, self.password, self.logger, f"sn[{ip}]") + self.node_hosts[uuid] = host + return host + + def sbctl(self, args, timeout=600, json_output=False): + command = "sudo /usr/local/bin/sbctl -d " + args + _, stdout_text, stderr_text = self.mgmt.run( + command, + timeout=timeout, + check=True, + label=f"sbctl {args}", + ) + if not json_output: + return stdout_text + for candidate in (stdout_text, stderr_text, stdout_text + "\n" + stderr_text): + candidate = candidate.strip() + if not candidate: + continue + try: + return json.loads(candidate) + except json.JSONDecodeError: + pass + decoder = json.JSONDecoder() + final_payloads = [] + list_payloads = [] + dict_payloads = [] + for start, char in enumerate(candidate): + if char not in "[{": + continue + try: + obj, end = decoder.raw_decode(candidate[start:]) + except json.JSONDecodeError: + continue + if not isinstance(obj, (dict, list)): + continue + if not candidate[start + end:].strip(): + final_payloads.append(obj) + elif isinstance(obj, list): + list_payloads.append(obj) + else: + dict_payloads.append(obj) + if final_payloads: + return final_payloads[-1] + if list_payloads: + return list_payloads[-1] + if dict_payloads: + return dict_payloads[-1] + raise TestRunError(f"Failed to parse JSON from sbctl {args}") + + def ensure_prerequisites(self): + self.logger.log(f"Using password auth as {self.user}; paramiko={'yes' if paramiko else 'no (sshpass fallback)'}") + self.client.run( + "if command -v dnf >/dev/null 2>&1; then " + "sudo dnf install -y nvme-cli fio xfsprogs; " + "else sudo apt-get update && sudo apt-get install -y nvme-cli fio xfsprogs; fi", + timeout=1800, + label="install client packages", + ) + self.client.run("sudo modprobe nvme_tcp", timeout=60, label="load nvme_tcp") + + def get_cluster_id(self): + if self.cluster_id: + return self.cluster_id + clusters = self.sbctl("cluster list --json", json_output=True) + if not clusters: + raise TestRunError("No clusters returned by sbctl cluster list") + self.cluster_id = clusters[0]["UUID"] + return self.cluster_id + + def get_nodes(self): + nodes = self.sbctl("sn list --json", json_output=True) + parsed = [] + for node in nodes: + parsed.append( + { + "uuid": node["UUID"], + "status": str(node.get("Status", "")).lower(), + "mgmt_ip": node.get("Mgmt IP") or node.get("mgmt_ip") or "", + "hostname": node.get("Hostname") or "", + } + ) + return parsed + + def ensure_expected_nodes(self): + nodes = self.get_nodes() + if len(nodes) != self.args.expected_node_count: + raise TestRunError( + f"Expected {self.args.expected_node_count} storage nodes, found {len(nodes)}. " + f"Update metadata or pass --expected-node-count." 
+ ) + return nodes + + def assert_cluster_not_suspended(self): + clusters = self.sbctl("cluster list --json", json_output=True) + if not clusters: + raise TestRunError("Cluster list returned no rows") + status = str(clusters[0].get("Status", "")).lower() + if status == "suspended": + raise TestRunError("Cluster is suspended") + return status + + def wait_for_all_online(self, target_nodes=None, timeout=None): + timeout = timeout or self.args.restart_timeout + expected = self.args.expected_node_count + target_nodes = set(target_nodes or []) + started = time.time() + while time.time() - started < timeout: + self.assert_cluster_not_suspended() + nodes = self.ensure_expected_nodes() + statuses = {node["uuid"]: node["status"] for node in nodes} + offline = [uuid for uuid, status in statuses.items() if status != "online"] + unaffected_bad = [ + uuid for uuid, status in statuses.items() + if uuid not in target_nodes and status != "online" + ] + if unaffected_bad: + raise TestRunError( + "Unaffected nodes are not online: " + + ", ".join(f"{uuid}:{statuses[uuid]}" for uuid in unaffected_bad) + ) + if not offline and len(statuses) == expected: + return nodes + self.logger.log( + "Waiting for all nodes online: " + + ", ".join(f"{uuid}:{status}" for uuid, status in statuses.items()) + ) + time.sleep(self.args.poll_interval) + raise TestRunError("Timed out waiting for nodes to return online") + + def wait_for_cluster_stable(self): + cluster_id = self.get_cluster_id() + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + cluster_list = self.sbctl("cluster list --json", json_output=True) + status = str(cluster_list[0].get("Status", "")).lower() + if status == "suspended": + raise TestRunError("Cluster entered suspended state") + cluster_info = self.sbctl(f"cluster get {cluster_id}", json_output=True) + rebalancing = bool(cluster_info.get("is_re_balancing", False)) + nodes = self.ensure_expected_nodes() + node_statuses = {node["uuid"]: node["status"] for node in nodes} + if status == "active" and not rebalancing and all( + state == "online" for state in node_statuses.values() + ): + self.logger.log("Cluster stable: ACTIVE, online, not rebalancing") + return + self.logger.log( + "Waiting for cluster stability: " + f"status={status}, rebalancing={rebalancing}, " + + ", ".join(f"{uuid}:{state}" for uuid, state in node_statuses.items()) + ) + time.sleep(self.args.poll_interval) + raise TestRunError("Timed out waiting for cluster rebalancing to finish") + + def get_active_tasks(self): + cluster_id = self.get_cluster_id() + script = ( + "import json; " + "from simplyblock_core import db_controller; " + "from simplyblock_core.models.job_schedule import JobSchedule; " + "db = db_controller.DBController(); " + f"tasks = db.get_job_tasks({cluster_id!r}, reverse=False); " + "out = [t.get_clean_dict() for t in tasks " + "if t.status != JobSchedule.STATUS_DONE and not getattr(t, 'canceled', False)]; " + "print(json.dumps(out))" + ) + out = self.mgmt.run( + f"sudo python3 -c {shlex.quote(script)}", + timeout=60, + label="list active tasks", + )[1].strip() + return json.loads(out or "[]") + + def wait_for_no_active_tasks(self, reason): + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.assert_cluster_not_suspended() + active_tasks = self.get_active_tasks() + if not active_tasks: + return + details = ", ".join( + f"{task.get('function_name')}:{task.get('status')}:{task.get('node_id') or task.get('device_id')}" + for task in active_tasks + ) + 
self.logger.log(f"Waiting before {reason}; active tasks: {details}") + time.sleep(self.args.poll_interval) + raise TestRunError(f"Timed out waiting for active tasks to finish before {reason}") + + @staticmethod + def _is_data_migration_task(task): + function_name = str(task.get("function_name", "")).lower() + task_name = str(task.get("task_name", "")).lower() + task_type = str(task.get("task_type", "")).lower() + haystack = " ".join([function_name, task_name, task_type]) + markers = ( + "migration", + "rebalanc", + "sync", + ) + return any(marker in haystack for marker in markers) + + def wait_for_data_migration_complete(self, reason): + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.assert_cluster_not_suspended() + active_tasks = self.get_active_tasks() + migration_tasks = [task for task in active_tasks if self._is_data_migration_task(task)] + if not migration_tasks: + return + details = ", ".join( + f"{task.get('function_name')}:{task.get('status')}:{task.get('node_id') or task.get('device_id')}" + for task in migration_tasks + ) + self.logger.log(f"Waiting before {reason}; data migration tasks: {details}") + time.sleep(self.args.poll_interval) + raise TestRunError( + f"Timed out waiting for data migration tasks to finish before {reason}" + ) + + def sbctl_allow_failure(self, args, timeout=600): + command = "sudo /usr/local/bin/sbctl -d " + args + rc, stdout_text, stderr_text = self.mgmt.run( + command, + timeout=timeout, + check=False, + label=f"sbctl {args}", + ) + return rc, stdout_text, stderr_text + + def shutdown_with_migration_retry(self, node_id): + while True: + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"sn shutdown {node_id}", + timeout=300, + ) + if rc == 0: + return + output = f"{stdout_text}\n{stderr_text}".lower() + retry_markers = ( + "migration", + "migrat", + "rebalanc", + "active task", + "running task", + "in_progress", + "in progress", + ) + if any(marker in output for marker in retry_markers): + self.logger.log( + f"Shutdown of {node_id} blocked by migration/rebalance/task; retrying in 15s" + ) + time.sleep(15) + continue + raise RemoteCommandError( + f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id}" + ) + + def prepare_client(self): + home = "/root" if self.user == "root" else posixpath.join("/home", self.user) + mount_root = posixpath.join(home, f"lab_outage_soak_{self.run_id}") + command = ( + "sudo pkill -f '[f]io --name=aws_dual_soak_' || true\n" + f"sudo mkdir -p {shlex.quote(mount_root)}\n" + f"sudo chown {shlex.quote(self.user)}:{shlex.quote(self.user)} {shlex.quote(mount_root)}\n" + ) + self.client.run(f"bash -lc {shlex.quote(command)}", timeout=120, label="prepare client workspace") + return mount_root + + def extract_uuid(self, text): + for line in reversed(text.splitlines()): + stripped = line.strip() + if UUID_RE.fullmatch(stripped): + return stripped + raise TestRunError(f"Failed to extract standalone UUID from output: {text}") + + def _create_one_volume(self, volume_name, node_uuid, index): + """Create one lvol bound to ``node_uuid`` and return its volume dict. + + Retries inside the rebalance window if the LVStore is being recreated + or while a rebalance / data migration is in flight, matching the + behaviour of the bulk ``create_volumes`` path. 
+ """ + volume_id = None + started = time.time() + while time.time() - started < self.args.rebalance_timeout: + self.wait_for_all_online(timeout=self.args.restart_timeout) + self.wait_for_cluster_stable() + output = self.sbctl( + f"lvol add {volume_name} {self.args.volume_size} {self.args.pool} --host-id {node_uuid}" + ) + if "ERROR:" in output or "LVStore is being recreated" in output: + self.logger.log(f"Volume create for {volume_name} deferred: {output.strip()}") + time.sleep(self.args.poll_interval) + continue + volume_id = self.extract_uuid(output) + break + if volume_id is None: + raise TestRunError(f"Timed out creating volume {volume_name} on node {node_uuid}") + self.created_volume_ids.append(volume_id) + self.logger.log(f"Created volume {volume_name} ({volume_id}) on node {node_uuid}") + return { + "index": index, + "volume_name": volume_name, + "volume_id": volume_id, + "node_uuid": node_uuid, + } + + def create_volumes(self, nodes): + self.logger.log( + f"Creating {len(nodes)} volumes of size {self.args.volume_size}, one per storage node" + ) + volumes = [] + for index, node in enumerate(nodes, start=1): + volume_name = f"aws_dual_soak_{self.run_id}_v{index}" + volumes.append(self._create_one_volume(volume_name, node["uuid"], index)) + return volumes + + def connect_and_mount_volumes(self, volumes, mount_root): + self.logger.log("Connecting volumes to client and preparing filesystems") + for volume in volumes: + self._connect_and_mount_one(volume, mount_root) + + def _connect_and_mount_one(self, volume, mount_root): + """Connect, mkfs, mount a single volume. Mutates ``volume`` to add + mount_point / fio_log / fio_stderr / rc_file / nqn keys. + + Saving ``nqn`` lets the churn cycle disconnect via ``nvme disconnect + -n `` without having to re-derive it from the device path. 
+ """ + connect_output = self.sbctl(f"lvol connect {volume['volume_id']}") + connect_commands = [] + for line in connect_output.splitlines(): + stripped = line.strip() + if stripped.startswith("sudo nvme connect"): + connect_commands.append(stripped) + if not connect_commands: + raise TestRunError(f"No nvme connect command returned for {volume['volume_id']}") + nqn = None + for cmd in connect_commands: + m = NQN_RE.search(cmd) + if m: + nqn = m.group(1) + break + if nqn is None: + raise TestRunError( + f"Failed to parse NQN from lvol connect output for {volume['volume_id']}" + ) + volume["nqn"] = nqn + successful_connects = 0 + failed_connects = [] + for connect_cmd in connect_commands: + try: + self.client.run(connect_cmd, timeout=120, label=f"connect {volume['volume_id']}") + successful_connects += 1 + except TestRunError as exc: + failed_connects.append(str(exc)) + self.logger.log(f"Path connect failed for {volume['volume_id']}: {exc}") + if successful_connects == 0: + raise TestRunError( + f"No nvme paths connected for {volume['volume_id']}: {'; '.join(failed_connects)}" + ) + if failed_connects: + self.logger.log( + f"Continuing with {successful_connects}/{len(connect_commands)} connected paths " + f"for {volume['volume_id']}" + ) + volume["mount_point"] = posixpath.join(mount_root, f"vol{volume['index']}") + volume["fio_log"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.log") + volume["fio_stderr"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.stderr") + volume["rc_file"] = posixpath.join(mount_root, f"fio_vol{volume['index']}.rc") + find_and_mount = ( + "set -euo pipefail\n" + f"dev=$(readlink -f /dev/disk/by-id/*{volume['volume_id']}* | head -n 1)\n" + "if [ -z \"$dev\" ]; then\n" + f" echo 'Failed to locate NVMe device for {volume['volume_id']}' >&2\n" + " exit 1\n" + "fi\n" + f"sudo mkfs.xfs -f \"$dev\"\n" + f"sudo mkdir -p {shlex.quote(volume['mount_point'])}\n" + f"sudo mount \"$dev\" {shlex.quote(volume['mount_point'])}\n" + f"sudo chown {shlex.quote(self.user)}:{shlex.quote(self.user)} {shlex.quote(volume['mount_point'])}\n" + ) + self.client.run( + f"bash -lc {shlex.quote(find_and_mount)}", + timeout=600, + label=f"format and mount {volume['volume_id']}", + ) + + def _build_fio_name(self, index, churn_id): + # Names embed both the volume index and the churn counter so the name + # is unique even after a churn replaces a volume — avoids prefix + # collisions when pkill -f matches by --name=. + return f"aws_dual_soak_v{index}_c{churn_id}" + + def _start_fio_for_volume(self, volume, fio_name): + # Capture fio's stdout+stderr to a dedicated file. --output only + # writes the aggregate summary on exit; progress lines and error + # messages ("fio: max_latency exceeded", IO error details, etc.) + # go to stderr during the run. That stream is the authoritative + # source for "what went wrong" — surface it on every fault. 
+        start_script = (
+            "set -euo pipefail\n"
+            f"rm -f {shlex.quote(volume['rc_file'])} {shlex.quote(volume['fio_stderr'])}\n"
+            "nohup bash -lc "
+            + shlex.quote(
+                f"cd {shlex.quote(volume['mount_point'])} && "
+                f"fio --name={fio_name} --directory={shlex.quote(volume['mount_point'])} "
+                "--direct=1 --rw=randrw --bs=4K --group_reporting --time_based "
+                f"--numjobs={FIO_NUMJOBS} --iodepth=4 --size=4G --runtime={self.args.runtime} "
+                "--ioengine=libaio --max_latency=20s --exitall_on_error=1 "
+                f"--output={shlex.quote(volume['fio_log'])}; "
+                "rc=$?; "
+                f"echo $rc > {shlex.quote(volume['rc_file'])}"
+            )
+            + f" > {shlex.quote(volume['fio_stderr'])} 2>&1 & echo $!"
+        )
+        _, stdout_text, _ = self.client.run(
+            f"bash -lc {shlex.quote(start_script)}",
+            timeout=60,
+            label=f"start fio {volume['volume_id']}",
+        )
+        pid_text = stdout_text.strip().splitlines()[-1]
+        pid = int(pid_text)
+        job = FioJob(
+            volume_id=volume["volume_id"],
+            volume_name=volume["volume_name"],
+            mount_point=volume["mount_point"],
+            fio_log=volume["fio_log"],
+            fio_stderr=volume["fio_stderr"],
+            rc_file=volume["rc_file"],
+            pid=pid,
+            fio_name=fio_name,
+        )
+        self.logger.log(f"Started fio for {volume['volume_name']} with pid {pid} (name={fio_name})")
+        return job
+
+    def start_fio(self, volumes):
+        self.logger.log("Starting fio on all mounted volumes in parallel")
+        fio_jobs = []
+        for volume in volumes:
+            # Embed the current churn counter so fio names stay unique
+            # across volume rebuilds (see the churn_counter comment in
+            # __init__); the pkill prefix aws_dual_soak_ still matches.
+            fio_name = self._build_fio_name(volume["index"], self.churn_counter)
+            fio_jobs.append(self._start_fio_for_volume(volume, fio_name))
+        self.fio_jobs = fio_jobs
+        # Give fio a few seconds to begin; don't block on worker fork — the
+        # authoritative "fio is in trouble" signal is rc_file / stderr, not
+        # process counts. If fio never issues IO the pre-stop check and
+        # outage cluster-health checks will surface that.
+        time.sleep(5)
+
+    def read_remote_file(self, path):
+        rc, stdout_text, _ = self.client.run(
+            f"bash -lc {shlex.quote(f'cat {shlex.quote(path)}')}",
+            timeout=30,
+            check=False,
+            label=f"read {path}",
+        )
+        if rc != 0:
+            return ""
+        return stdout_text
+
+    # ----- fio fault detection --------------------------------------------
+
+    # Any line in fio_stderr matching one of these (case-sensitive, fixed
+    # string) is treated as a fio fault — even if fio is still running.
+    # ``--max_latency`` violations in particular log "fio: latency of …
+    # exceeds specified max" and do NOT always terminate fio when run
+    # with --group_reporting + --numjobs>1, so a process-still-alive
+    # check alone misses them.
+    FIO_STDERR_ERROR_MARKERS = (
+        "fio: latency of",      # --max_latency violation
+        "fio: io_u error",      # io_u submission/completion error
+        "fio: pid=",            # generic fio per-job error dump
+        "io_u error on file",   # alternate io_u error format
+        "verify failed",        # data verification fault
+        "fio: verify",          # alternate verify error
+        "fio: error",           # generic fio error
+        "Killed",               # bash reports fio got SIGKILL
+        "Terminated",           # bash reports fio got SIGTERM (no churn here)
+    )
+
+    def _read_rc_file(self, job):
+        """Return the rc string if fio's wrapping bash wrote rc_file, else None.
+
+        ``rc_file`` is one of three independent fault signals; see
+        ``_check_fio_fault``.
+ """ + probe = ( + f"if [ -f {shlex.quote(job.rc_file)} ]; then " + f"cat {shlex.quote(job.rc_file)}; fi" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(probe)}", + timeout=15, + check=False, + label=f"check rc_file {job.volume_name}", + ) + rc = (stdout_text or "").strip() + return rc or None + + def _wrapper_alive(self, job): + """Return True iff the wrapping bash that runs fio is still alive. + + ``job.pid`` is the pid printed by ``echo $!`` at start_fio time + (the nohup'd bash, parent of fio). If that pid is gone AND no + rc_file was written, fio was signalled away and bash never got + to record an exit code — that case is a fault, not "still running". + """ + probe = ( + f"if kill -0 {int(job.pid)} 2>/dev/null; then echo alive; fi" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(probe)}", + timeout=15, + check=False, + label=f"check wrapper pid {job.volume_name}", + ) + return stdout_text.strip() == "alive" + + def _scan_fio_stderr_for_errors(self, job): + """Return matching error lines from fio_stderr (up to 20), or "". + + See FIO_STDERR_ERROR_MARKERS for the list. ``--max_latency`` + violations in particular are reported here even while fio + continues running, so this catches faults the rc_file / pid + checks would miss. + """ + if not self.FIO_STDERR_ERROR_MARKERS: + return "" + grep_args = " ".join( + f"-e {shlex.quote(p)}" for p in self.FIO_STDERR_ERROR_MARKERS + ) + grep_cmd = ( + f"grep -F -m 20 {grep_args} " + f"{shlex.quote(job.fio_stderr)} 2>/dev/null || true" + ) + _, stdout_text, _ = self.client.run( + f"bash -lc {shlex.quote(grep_cmd)}", + timeout=15, + check=False, + label=f"scan stderr {job.volume_name}", + ) + return stdout_text.strip() + + def _check_fio_fault(self, job): + """Detect any fio fault for ``job``. Returns ``(kind, detail)`` or None. + + Three independent signals — ANY one is a fault: + * ``exited``: fio's wrapping bash wrote rc_file (any rc, including 0, + is a fault mid-run because fio's --runtime is orders of magnitude + longer than an outage iteration). + * ``missing``: the wrapping bash pid is gone and no rc_file was + written — fio was signalled away (or its wrapper died) without + recording an exit code. + * ``stderr_error``: fio_stderr contains a known fio error marker + (max_latency violation, io_u error, verify failure, etc.) — fio + may still be running but is degraded; treat it as a fault. + + ``detail`` is a human-readable one-liner. The full stderr/output + is dumped via ``_dump_fio_streams`` by the callers. 
+ """ + rc = self._read_rc_file(job) + if rc is not None: + return ("exited", f"fio exited rc={rc}") + + if not self._wrapper_alive(job): + return ( + "missing", + f"fio wrapper pid {job.pid} is gone and no rc_file was written", + ) + + err = self._scan_fio_stderr_for_errors(job) + if err: + first_line = err.splitlines()[0][:240] + return ("stderr_error", f"stderr error marker: {first_line}") + + return None + + def _dump_fio_streams(self, job, context): + """Write fio's captured stderr and --output summary into the soak + log so the actual fio error text (max_latency violations, IO + errors, "fio: pid=…, err=…, func=…" lines) is visible next to + the outage scenario that triggered it.""" + for label, path, lines in [ + ("fio stderr", job.fio_stderr, 200), + ("fio summary", job.fio_log, 60), + ]: + _, body, _ = self.client.run( + f"bash -lc {shlex.quote(f'tail -{lines} {shlex.quote(path)} 2>/dev/null || true')}", + timeout=30, + check=False, + label=f"dump {label} {job.volume_name}", + ) + if body.strip(): + self.logger.block( + f"[{context}] {job.volume_name} {label} ({path}):", + body, + ) + else: + self.logger.log( + f"[{context}] {job.volume_name} {label} ({path}): (empty)" + ) + + def check_fio(self): + """Raise if any tracked fio shows a fault. + + Three independent signals are evaluated per ``_check_fio_fault``: + rc_file written (fio exited), wrapper pid gone with no rc_file + (signalled away), or a fio error marker in stderr (max_latency + violation, io_u/verify error, etc.). ANY of these is a fault — + fio's ``--runtime`` is orders of magnitude longer than a single + outage iteration, and a degraded-but-running fio is just as + invalid a result as a dead one. + + On fault, every faulting job's captured stderr and --output + summary are dumped into the soak log so the exact fio error + lines are visible next to the iteration that triggered them. + """ + faulted = [] + for job in self.fio_jobs: + fault = self._check_fio_fault(job) + if fault is not None: + faulted.append((job, fault)) + if not faulted: + return + for job, (kind, detail) in faulted: + self._dump_fio_streams(job, context=f"fio fault [{kind}] {detail}") + details = ", ".join( + f"{j.volume_name}={kind}:{detail}" for j, (kind, detail) in faulted + ) + raise TestRunError(f"fio fault detected: {details}") + + def ensure_fio_running(self): + self.check_fio() + + def stop_fio(self): + """Stop every fio process launched by this soak on the client host. + + Called between outage iterations so rebalancing runs unloaded. + Before killing, calls ``check_fio`` — any fio that wrote its + rc_file is a mid-run exit, which is a fault. Dumps the captured + fio stderr/summary into the soak log so the actual fio error + text is side-by-side with the outage scenario that triggered it. + After the check passes we SIGTERM (short grace window) then + SIGKILL; matching by ``fio --name=aws_dual_soak_*`` catches both + the bash wrapper and any fio workers. + """ + if not self.fio_jobs: + return + + # Pre-kill verification: any fio having exited is a fault. + self.check_fio() + + self.logger.log("All fio still running; stopping them between iterations") + kill_script = ( + "set +e\n" + "sudo pkill -TERM -f 'fio --name=aws_dual_soak_' 2>/dev/null || true\n" + "for i in $(seq 1 15); do\n" + " if ! 
pgrep -f 'fio --name=aws_dual_soak_' >/dev/null; then\n" + " exit 0\n" + " fi\n" + " sleep 2\n" + "done\n" + "sudo pkill -KILL -f 'fio --name=aws_dual_soak_' 2>/dev/null || true\n" + ) + self.client.run( + f"bash -lc {shlex.quote(kill_script)}", + timeout=90, + check=False, + label="stop fio", + ) + # Drop the job list; start_fio will rebuild it from self.volumes. + self.fio_jobs = [] + + # ----- single-volume fio + churn --------------------------------------- + + def _disconnect_one_volume(self, volume): + nqn = volume.get("nqn") + if not nqn: + self.logger.log( + f"WARNING: no NQN saved for {volume['volume_name']}; skipping nvme disconnect" + ) + return + # ``nvme disconnect -n `` tears down every controller (path) for + # that subsystem in one call, so multipath connections are handled + # without a per-path teardown loop. + self.client.run( + f"sudo nvme disconnect -n {shlex.quote(nqn)}", + timeout=60, + check=False, + label=f"nvme disconnect {volume['volume_name']}", + ) + + def _unmount_one_volume(self, volume): + mount_point = volume.get("mount_point") + if not mount_point: + return + # Try plain unmount first, then -f, then lazy as last resort. We've + # already SIGKILLed the fio holding the mount, so plain umount + # should succeed on the happy path; the fallbacks only matter if + # buffered IO is still draining. + umount_script = ( + f"sudo umount {shlex.quote(mount_point)} 2>/dev/null || " + f"sudo umount -f {shlex.quote(mount_point)} 2>/dev/null || " + f"sudo umount -l {shlex.quote(mount_point)} 2>/dev/null || true" + ) + self.client.run( + f"bash -lc {shlex.quote(umount_script)}", + timeout=60, + check=False, + label=f"umount {volume['volume_name']}", + ) + + def _delete_one_lvol(self, volume): + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"lvol delete {volume['volume_id']} --force", + timeout=600, + ) + if rc != 0: + raise TestRunError( + f"lvol delete failed for {volume['volume_name']} ({volume['volume_id']}): " + f"{stdout_text.strip()} | {stderr_text.strip()}" + ) + if volume["volume_id"] in self.created_volume_ids: + self.created_volume_ids.remove(volume["volume_id"]) + + def _churn_one_volume(self): + """Rebuild one randomly-selected volume. + + Called between iterations, AFTER stop_fio. fio is not running, so + no per-job teardown is needed — we just delete + recreate + + remount the volume. The next iteration's start_fio() will pick up + the fresh volume and start a new fio job for it. + """ + if not self.volumes: + return + idx = random.randrange(len(self.volumes)) + old_volume = self.volumes[idx] + + self.churn_counter += 1 + churn_id = self.churn_counter + new_name = f"aws_dual_soak_{self.run_id}_v{old_volume['index']}_c{churn_id}" + self.logger.log( + f"churn {churn_id}: rebuilding {old_volume['volume_name']} " + f"({old_volume['volume_id']}) on node {old_volume['node_uuid']} " + f"-> {new_name}" + ) + + self._unmount_one_volume(old_volume) + self._disconnect_one_volume(old_volume) + self._delete_one_lvol(old_volume) + + # Recreate on the SAME storage node so the topology used by the + # outage scenario list (pinned at startup off role-representative + # pairs) doesn't drift. 
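+        # The rebuilt volume reuses the old index, so mount_point and the
+        # fio_vol<index>.* paths stay stable; cross-churn uniqueness comes
+        # from the _c<churn_id> suffix in the volume name.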
+ new_volume = self._create_one_volume( + new_name, + old_volume["node_uuid"], + old_volume["index"], + ) + self._connect_and_mount_one(new_volume, self.mount_root) + + self.volumes[idx] = new_volume + self.logger.log( + f"churn {churn_id}: complete; {new_name} ({new_volume['volume_id']}) " + f"will be picked up by next start_fio" + ) + + # ----- outage methods --------------------------------------------------- + + def _forced_shutdown(self, node_id): + """Shutdown with --force; still retry if blocked by migration.""" + while True: + rc, stdout_text, stderr_text = self.sbctl_allow_failure( + f"sn shutdown {node_id} --force", + timeout=300, + ) + if rc == 0: + return + output = f"{stdout_text}\n{stderr_text}".lower() + retry_markers = ( + "migration", "migrat", "rebalanc", + "active task", "running task", + "in_progress", "in progress", + ) + if any(m in output for m in retry_markers): + self.logger.log( + f"Forced shutdown of {node_id} blocked by migration/task; retrying in 15s" + ) + time.sleep(15) + continue + raise RemoteCommandError( + f"mgmt: command failed with rc={rc}: sbctl sn shutdown {node_id} --force" + ) + + def _container_kill(self, node_id): + """Kill the SPDK container on the storage node's host. Node is expected + to auto-recover; no sbctl restart is issued.""" + host = self._node_host(node_id) + cmd = ( + "set -euo pipefail; " + "cns=$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true); " + "if [ -z \"$cns\" ]; then echo 'no spdk_* container found' >&2; exit 0; fi; " + "for cn in $cns; do echo \"killing $cn\"; sudo docker kill \"$cn\" || true; done" + ) + host.run( + f"bash -lc {shlex.quote(cmd)}", + timeout=120, + check=False, + label=f"container_kill {node_id}", + ) + + def _host_reboot(self, node_id): + """Reboot the storage node's host. Node is expected to auto-recover; + no sbctl restart is issued.""" + host = self._node_host(node_id) + # nohup + background + sleep so the shell exit beats reboot cleanly + cmd = "sudo nohup bash -c 'sleep 2; reboot -f' >/dev/null 2>&1 &" + try: + host.run( + f"bash -lc {shlex.quote(cmd)}", + timeout=30, + check=False, + label=f"host_reboot {node_id}", + ) + except RemoteCommandError as exc: + # SSH may drop as the host goes down — not fatal. + self.logger.log(f"host_reboot {node_id}: ssh terminated as expected: {exc}") + # Drop the cached SSH client; it's going to die anyway. + cached = self.node_hosts.pop(node_id, None) + if cached is not None: + try: + cached.close() + except Exception: + pass + + # --- Multipath NIC chaos --- + + def _is_multipath(self): + return bool(self.metadata.get("multipath")) + + def _get_data_nics(self): + """Return the list of data NIC names (e.g. ['eth1', 'eth2']).""" + nics = self.metadata.get("data_nics") + if nics: + return nics + iface = self.metadata.get("data_iface") + if iface: + return [iface] + return [] + + def _network_outage(self, node_id, duration): + """Take all data NICs down on one storage node for *duration* seconds, + then bring them back up. Simulates a transient network partition of + a single node. 
Node is expected to auto-recover once the NICs return + — no sbctl restart is issued.""" + host = self._node_host(node_id) + nics = self._get_data_nics() or ["eth1"] + self.logger.log( + f"network_outage on {node_id}: dropping {nics} for {duration}s" + ) + for nic in nics: + try: + host.run(f"sudo ip link set {nic} down", timeout=10, check=False, + label=f"netout down {nic} on {node_id}") + except Exception as e: + self.logger.log(f"WARNING: failed to down {nic} on {node_id}: {e}") + try: + time.sleep(duration) + finally: + for nic in nics: + try: + host.run(f"sudo ip link set {nic} up", timeout=10, check=False, + label=f"netout up {nic} on {node_id}") + except Exception as e: + self.logger.log(f"WARNING: failed to up {nic} on {node_id}: {e}") + + def _apply_outage(self, node_id, method): + self.logger.log(f"Applying outage '{method}' on {node_id}") + if method == "graceful": + self.shutdown_with_migration_retry(node_id) + elif method == "forced": + self._forced_shutdown(node_id) + elif method == "container_kill": + self._container_kill(node_id) + elif method == "host_reboot": + self._host_reboot(node_id) + elif method.startswith("network_outage_"): + try: + duration = int(method.rsplit("_", 1)[-1]) + except ValueError: + raise TestRunError(f"Unknown outage method: {method}") + self._network_outage(node_id, duration) + else: + raise TestRunError(f"Unknown outage method: {method}") + + def _needs_manual_restart(self, method): + return method not in AUTO_RECOVER_METHODS + + def wait_node_leaves_online(self, node_id, timeout=90, poll=2): + """Poll sbctl until the control plane observes node_id leaving 'online'. + Returns True once any non-online status is seen, False on timeout. + + Why this exists: the CP's health-check loop updates status on its own + cadence. If the soak polls wait_for_all_online *before* the CP has + noticed the outage, the first poll reports all-online and we return + while the target is actually still down. The next iteration then + stacks extra outages on a silently-offline node and breaks the FTT + budget (see incident: 2026-04-20 iter 17 container_kill on 2870dfa5, + CP status transition lagged the soak's first sn-list by ~1 s). + """ + deadline = time.time() + timeout + while time.time() < deadline: + try: + nodes = self.get_nodes() + except Exception as exc: + self.logger.log(f"wait_node_leaves_online: sn list failed ({exc})") + time.sleep(poll) + continue + status = next( + (n["status"] for n in nodes if n["uuid"] == node_id), + "unknown", + ) + if status != "online": + self.logger.log( + f"CP observed {node_id[:8]} leaving online (now {status})" + ) + return True + time.sleep(poll) + return False + + def run_outage_pair(self, node1, node2, method1, method2): + self.logger.log( + f"Outage pair: {node1}={method1} and {node2}={method2}" + ) + # Apply first outage, then optional gap, then second outage. + self._apply_outage(node1, method1) + if self.args.shutdown_gap: + time.sleep(self.args.shutdown_gap) + self._apply_outage(node2, method2) + + # Issue sbctl restart only for methods that leave the node in a + # "shutdown" state that the CP won't recover on its own. + # Retry with backoff: when the other node in the pair used an + # auto-recover method (container_kill / host_reboot), it may + # still be in_shutdown or in_restart when we try to restart the + # manually-recovered peer — the per-cluster guard rejects + # concurrent restarts. Retrying gives the auto-recovering node + # time to come back. 
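+        # The loop below re-issues `sn restart` every 15s (fixed interval)
+        # until it succeeds or --restart-timeout elapses; the last failure
+        # is re-raised once the deadline passes.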
+ for node_id, method in [(node1, method1), (node2, method2)]: + if not self._needs_manual_restart(method): + continue + deadline = time.time() + self.args.restart_timeout + while True: + try: + # Emit a RESTART header with the wall-clock timestamp, + # then dump the raw sbctl -d restart stdout below it + # (without per-line timestamp prefix) so the CP trace + # produced by -d lines up with a single moment in time. + self.logger.log( + f"RESTART: {time.strftime('%Y-%m-%d %H:%M:%S')} {node_id}" + ) + stdout_text = self.sbctl(f"sn restart {node_id}", timeout=300) + with self.logger.lock: + print(stdout_text, flush=True, end="" + if stdout_text.endswith("\n") else "\n") + with open(self.logger.path, "a", encoding="utf-8") as handle: + handle.write(stdout_text) + if not stdout_text.endswith("\n"): + handle.write("\n") + break + except Exception as e: + if time.time() >= deadline: + raise + self.logger.log( + f"Restart of {node_id} failed ({e}), " + f"retrying in 15s (peer may still be recovering)") + time.sleep(15) + + # Before we call wait_for_all_online, make sure the control plane has + # actually observed each auto-recover target leaving 'online' state. + # Otherwise wait_for_all_online can race the CP: the first sn-list + # poll may still report the just-killed node as 'online' (stale), + # all statuses look good, and we return immediately — the node is + # then in a silent offline state when the next iteration stacks + # more outages on top, crossing the FTT budget. + # network_outage_* methods can finish before the CP notices; that's + # fine (short outages often recover from HA multipath without CP + # involvement), so we don't fail if the observation window expires. + for node_id, method in [(node1, method1), (node2, method2)]: + if method not in AUTO_RECOVER_METHODS: + continue + if method.startswith("network_outage_"): + observed = self.wait_node_leaves_online(node_id, timeout=30) + if not observed: + self.logger.log( + f"CP did not observe {node_id[:8]} offline for " + f"{method} within 30s (expected for short NIC drops)" + ) + else: + # container_kill, host_reboot: the node IS down; we must see it. + observed = self.wait_node_leaves_online(node_id, timeout=90) + if not observed: + self.logger.log( + f"WARN: CP never observed {node_id[:8]} offline after " + f"{method} within 90s; sn-list may be stale" + ) + + # For auto-recovery methods, allow a longer wait window since the host + # has to reboot / the container has to come back under its supervisor. + wait_timeout = self.args.restart_timeout + if any( + m in AUTO_RECOVER_METHODS for m in (method1, method2) + ): + wait_timeout = max(wait_timeout, self.args.auto_recover_wait) + + self.wait_for_all_online( + target_nodes={node1, node2}, timeout=wait_timeout + ) + # Intentionally no check_fio / wait_for_cluster_stable here: the + # outer loop calls check_fio right after this returns, then + # stop_fio, then waits for cluster stability unloaded. + + # ----- topology & scenario enumeration --------------------------------- + + def discover_topology(self): + """Return {lvs_name: {'primary': uuid, 'secondary': uuid, 'tertiary': uuid}}. + + Queried once at soak startup to identify the 4 role-representative + node pairs. Leader takeover mid-soak may shift role assignments; + the scenario list is pinned at startup so the 4 chosen pairs stay + fixed across retries even if the CP has re-promoted since. 
+ """ + script = ( + "import json; " + "from simplyblock_core import db_controller; " + "db = db_controller.DBController(); " + "nodes = db.get_storage_nodes(); " + "out = {n.lvstore: {" + "'primary': n.get_id(), " + "'secondary': getattr(n, 'secondary_node_id', '') or '', " + "'tertiary': getattr(n, 'tertiary_node_id', '') or ''" + "} for n in nodes " + "if getattr(n, 'lvstore', '') " + "and not getattr(n, 'is_secondary_node', False)}; " + "print(json.dumps(out))" + ) + _, stdout_text, _ = self.mgmt.run( + f"sudo python3 -c {shlex.quote(script)}", + timeout=60, + label="discover topology", + ) + for line in reversed((stdout_text or "").strip().splitlines()): + line = line.strip() + if line.startswith("{"): + try: + return json.loads(line) + except json.JSONDecodeError: + continue + raise TestRunError( + f"Failed to parse topology JSON from mgmt; stdout was:\n{stdout_text}" + ) + + def _validate_topology_for_categories(self): + """Verify the pinned topology can supply at least one pair per category. + + Raises TestRunError if: + * the pinned topology has no LVS (empty cluster) + * no LVS has both primary and secondary (primary_secondary unservable) + * no LVS has both primary and tertiary (primary_tertiary unservable) + * no unrelated pair exists — in a dense FT=2 ring with N ≤ 4 this + is possible; raise so the coverage gap is explicit. + """ + if not self.topology: + raise TestRunError("Empty topology; cannot pick representative pairs") + + if not self._candidate_pairs_for_role("secondary"): + raise TestRunError( + "No LVS in topology has both primary and secondary; " + "primary_secondary category is unservable" + ) + if not self._candidate_pairs_for_role("tertiary"): + raise TestRunError( + "No LVS in topology has both primary and tertiary; " + "primary_tertiary category is unservable" + ) + + all_nodes, lvs_members = self._lvs_membership() + if not self._unrelated_pairs(all_nodes, lvs_members): + raise TestRunError( + "No unrelated node pair found in topology " + f"({len(all_nodes)} nodes across {len(lvs_members)} LVSs)" + ) + + def _candidate_pairs_for_role(self, role_b): + """All (primary, role_b) pairs across the pinned topology.""" + pairs = [] + for roles in self.topology.values(): + a = roles.get("primary") + b = roles.get(role_b) + if a and b: + pairs.append((a, b)) + return pairs + + def _lvs_membership(self): + """Return (all_nodes, lvs_members) derived from the pinned topology.""" + all_nodes = set() + lvs_members = [] + for r in self.topology.values(): + members = {v for v in r.values() if v} + lvs_members.append(members) + all_nodes.update(members) + return all_nodes, lvs_members + + def _unrelated_pairs(self, all_nodes, lvs_members): + """All node pairs that share no LVS in any role.""" + pairs = [] + for a, b in itertools.combinations(sorted(all_nodes), 2): + if not any(a in m and b in m for m in lvs_members): + pairs.append((a, b)) + return pairs + + def pick_pair_for_category(self, category): + """Randomly pick a (node_a, node_b) pair for the given role category. 
+
+        Distance preserved per category (so each scenario in a "group" hits
+        the same topological relationship, just on different concrete nodes):
+          - primary_secondary: ring-distance 1 (direct successor)
+          - primary_tertiary:  ring-distance 2 (exactly one node between)
+          - unrelated:         ring-distance ≥ 3 (≥ 2 nodes between)
+        """
+        if category == "unrelated":
+            all_nodes, lvs_members = self._lvs_membership()
+            candidates = self._unrelated_pairs(all_nodes, lvs_members)
+        elif category == "primary_secondary":
+            candidates = self._candidate_pairs_for_role("secondary")
+        elif category == "primary_tertiary":
+            candidates = self._candidate_pairs_for_role("tertiary")
+        else:
+            raise TestRunError(f"Unknown role category: {category}")
+
+        if not candidates:
+            raise TestRunError(
+                f"No candidate pairs available for category {category}"
+            )
+        return random.choice(candidates)
+
+    def build_scenarios(self, nodes):
+        """Enumerate role categories × P(M,2) ordered method pairs.
+
+        Returns a list of dicts with keys: method_a, method_b, category.
+        The actual (a, b) node pair is rolled at iteration time via
+        ``pick_pair_for_category`` so the soak hits many concrete pairs per
+        group while keeping the relative ring-distance fixed per category.
+        Same-method pairs are NOT included — ordered distinct pairs
+        only, per itertools.permutations(methods, 2).
+        """
+        _ = nodes  # unused: pair picking happens at iteration time
+        scenarios = []
+        for category in ROLE_CATEGORIES:
+            for m_a, m_b in itertools.permutations(self.methods, 2):
+                scenarios.append({
+                    "method_a": m_a,
+                    "method_b": m_b,
+                    "category": category,
+                })
+        method_pair_count = len(self.methods) * (len(self.methods) - 1)
+        self.logger.log(
+            f"Built {len(scenarios)} scenarios: "
+            f"{len(ROLE_CATEGORIES)} role categories × "
+            f"P({len(self.methods)},2)={method_pair_count} ordered method pairs "
+            f"(node pair rolled randomly per scenario)"
+        )
+        return scenarios
+
+    def run(self):
+        self.ensure_prerequisites()
+        nodes = self.ensure_expected_nodes()
+        self.wait_for_all_online(timeout=self.args.restart_timeout)
+        # Wait for the cluster to be fully stable (no in-flight rebalance
+        # or data migration) before starting iterations.
+        self.wait_for_cluster_stable()
+        self.wait_for_data_migration_complete("test start")
+        mount_root = self.prepare_client()
+        # Saved so the churn cycle can mount its newly-created volume back
+        # into the same workspace tree.
+        self.mount_root = mount_root
+        volumes = self.create_volumes(nodes)
+        # Stored so the churn cycle can drive per-volume teardown/rebuild
+        # without re-creating / re-mounting the underlying soak workspace.
+        self.volumes = volumes
+        self.connect_and_mount_volumes(volumes, mount_root)
+
+        # Pin the topology once, before any outages. Leader takeover during
+        # the soak can permanently shift role assignments, but the candidate
+        # pair pools are fixed at startup, so every cycle draws its pairs
+        # for each role category from the same topology snapshot.
+ self.topology = self.discover_topology() + self.logger.log(f"Pinned topology: {json.dumps(self.topology, sort_keys=True)}") + self._validate_topology_for_categories() + self.scenarios = self.build_scenarios(nodes) + if not self.scenarios: + raise TestRunError("No outage scenarios built; method/node list empty") + + start_at = max(1, self.args.start_at) + if start_at > len(self.scenarios): + raise TestRunError( + f"--start-at {start_at} exceeds scenario count " + f"{len(self.scenarios)}; nothing to run" + ) + + churn_every = 0 if self.args.no_churn else self.args.churn_every_n_iters + if churn_every > 0: + self.logger.log( + f"Volume churn enabled: rebuild one random volume every " + f"{churn_every} iteration(s) in the unloaded settle window" + ) + else: + self.logger.log("Volume churn disabled") + + # iteration counter is aligned to scenario_idx: when --start-at N is + # used, the first executed scenario logs as iteration=N so post-hoc + # grep for "iteration 60" finds the resumed scenario and its prior + # failure side by side. + iteration = start_at - 1 + cycle = 0 + while True: + cycle += 1 + if self.args.cycles and cycle > self.args.cycles: + self.logger.log( + f"Completed {cycle - 1} full cycle(s) of {len(self.scenarios)} " + f"scenarios; exiting" + ) + return + + cycle_scenarios = list(self.scenarios) + if self.args.shuffle_scenarios: + # Seed off cycle number so two soaks with the same --cycles + # walk identical sequences, but successive cycles rotate + # through different orderings. + random.Random(cycle).shuffle(cycle_scenarios) + + cycle_start_at = start_at if cycle == 1 else 1 + self.logger.log( + f"Starting cycle {cycle} ({len(cycle_scenarios)} scenarios" + f"{', shuffled' if self.args.shuffle_scenarios else ''}" + f"{f', starting at scenario {cycle_start_at}' if cycle_start_at > 1 else ''})" + ) + + for scenario_idx, scenario in enumerate(cycle_scenarios, 1): + if scenario_idx < cycle_start_at: + continue + iteration += 1 + + node1, node2 = self.pick_pair_for_category(scenario["category"]) + method1 = scenario["method_a"] + method2 = scenario["method_b"] + + self.logger.log( + f"Starting outage iteration {iteration} " + f"(cycle {cycle} scenario {scenario_idx}/{len(cycle_scenarios)}): " + f"category={scenario['category']} " + f"pair=({node1[:8]},{node2[:8]}) " + f"methods=({method1},{method2})" + ) + + # Skip scenarios whose nodes are not currently in the + # expected-node set (e.g. one has been removed from the + # cluster mid-soak). Better to log-and-skip than to try to + # restart a ghost. + current_uuids = {n["uuid"] for n in self.ensure_expected_nodes()} + missing = [uid for uid in (node1, node2) if uid not in current_uuids] + if missing: + self.logger.log( + f"Scenario {iteration} skipped: nodes {missing} not in " + f"current cluster set {sorted(current_uuids)}" + ) + continue + + # Load during outage: start fresh fio so the outage hits + # live IO. This is the only window where fio runs. + self.start_fio(self.volumes) + + self.run_outage_pair(node1, node2, method1, method2) + + # Fault gate: any fio that exited / faulted during the + # outage is a real failure. check_fio raises on faults. + self.check_fio() + + # Unload before settle: stop fio so the rebalance / data- + # migration drain runs without IO pressure. This is what + # makes the iteration cycle short. + self.stop_fio() + + # Optional volume rebuild. fio is already stopped, so we + # don't need any per-job teardown — just delete + recreate + # + remount; the next iteration's start_fio picks it up. 
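+                # Cadence sketch (flag value hypothetical): with
+                # --churn-every-n-iters 3, iterations 3, 6, 9, ... each
+                # rebuild one volume; `iteration` keeps counting across
+                # cycles, so the cadence is global, not per-cycle.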
+ if churn_every > 0 and iteration % churn_every == 0: + self._churn_one_volume() + + self.wait_for_all_online(timeout=self.args.restart_timeout) + self.wait_for_cluster_stable() + self.wait_for_data_migration_complete("next iteration") + + +def main(): + args = parse_args() + logger = Logger(args.log_file) + logger.log(f"Logging to {args.log_file}") + metadata = load_metadata(args.metadata) + if not metadata.get("clients"): + raise SystemExit("Metadata file does not contain a client host") + + runner = SoakRunner(args, metadata, logger) + try: + runner.run() + except (RemoteCommandError, TestRunError, ValueError) as exc: + logger.log(f"ERROR: {exc}") + sys.exit(1) + finally: + runner.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/setup_lab_perf_test1.py b/scripts/setup_lab_perf_test1.py index 305d836fd..ca44ce49f 100644 --- a/scripts/setup_lab_perf_test1.py +++ b/scripts/setup_lab_perf_test1.py @@ -41,8 +41,8 @@ USER = "root" IFACE = "eth0" DATA_IFACE = "eth1" -BRANCH = "inline-checksum-validation" -MAX_LVOL = "100" +BRANCH = "main" +MAX_LVOL = "25" # Same volume plan layout as the AWS variant; consumed by downstream perf tooling. VOLUME_PLAN = [ @@ -280,6 +280,78 @@ def normalize_ref(value): return json.loads(output) +def fetch_alceml_modes(mgmt_ip, cluster_uuid): + """Return per-alceml mode info for every storage device in the cluster. + + Mirrors simplyblock_core.utils.alceml_checksum_params: + 0 = off (cluster.inline_checksum False) + 1 = md-on-device (cluster ON, device md_supported) + 2 = fallback / emulation (cluster ON, device has no md-capable LBAF) + """ + script = f"""python3 - <<'PY' +import json +from simplyblock_core.db_controller import DBController + +db = DBController() +cluster = db.get_cluster_by_id({cluster_uuid!r}) +nodes = db.get_storage_nodes_by_cluster_id({cluster_uuid!r}) or [] +inline = bool(getattr(cluster, "inline_checksum", False)) + +rows = [] +for node in nodes: + label = getattr(node, "hostname", "") or node.get_id() + for dev in (getattr(node, "nvme_devices", None) or []): + md_supported = bool(getattr(dev, "md_supported", False)) + md_size = int(getattr(dev, "md_size", 0) or 0) + if not inline: + method, mode_label = 0, "off" + elif md_supported: + method, mode_label = 1, "md-on-device" + else: + method, mode_label = 2, "fallback (emulation)" + rows.append({{ + "node": label, + "alceml": getattr(dev, "alceml_name", "") or getattr(dev, "uuid", ""), + "method": method, + "mode": mode_label, + "md_supported": md_supported, + "md_size": md_size, + }}) + +print(json.dumps({{"inline_checksum": inline, "devices": rows}}, indent=2)) +PY""" + output = ssh_exec(mgmt_ip, [script], get_output=True, check=True)[0] + return json.loads(output) + + +def print_alceml_summary(summary): + inline = summary.get("inline_checksum", False) + devices = summary.get("devices", []) + print("\n--- ALCEML inline-checksum modes ---") + print(f"Cluster inline_checksum: {'ENABLED' if inline else 'disabled'}") + if not devices: + print(" (no devices reported)") + return + by_node = {} + for row in devices: + by_node.setdefault(row["node"], []).append(row) + for node, rows in sorted(by_node.items()): + print(f" {node}:") + for row in rows: + print( + f" - {row['alceml'] or '(unnamed)':<40} " + f"method={row['method']} {row['mode']:<22} " + f"md_size={row['md_size']} md_supported={row['md_supported']}" + ) + md_count = sum(1 for r in devices if r["method"] == 1) + fb_count = sum(1 for r in devices if r["method"] == 2) + off_count = sum(1 for r in devices if 
r["method"] == 0) + print( + f"Totals: md-on-device={md_count} fallback={fb_count} off={off_count} " + f"(of {len(devices)} devices)" + ) + + def parse_args(): parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( @@ -296,6 +368,16 @@ def parse_args(): default="cluster_metadata_base.json", help="Where to write the cluster metadata JSON (default: ./cluster_metadata_base.json).", ) + parser.add_argument( + "--no-inline-checksum", + action="store_true", + help=( + "Disable inline CRC checksum validation. By default the cluster is " + "created with --enable-inline-checksum (matching the inline-checksum-" + "validation branch + ultra:checksum-validation-latest image). The " + "flag is frozen at create time and cannot be changed later." + ), + ) return parser.parse_args() @@ -348,19 +430,140 @@ def main(): t.result() print("Phase 1: DONE - all nodes have sbcli installed.") + # --- Phase 1.5: cleanup leftover state from any prior deploy --- + # Order matters: + # 1. sn deploy-cleaner first (tears down SPDK containers + NVMe state). + # 2. docker rm -f any stragglers, then `docker system prune -af --volumes`. + # Per the deployment notes: SAFE before cluster create (no active FDB + # volumes yet); NEVER run after activate (it would wipe FDB). + # 3. Fresh `docker pull` of the simplyblock + ultra images named in the + # installed env_var, so we don't reuse a stale cached layer. + print("Phase 1.5a: Running sbctl sn deploy-cleaner on every node...") + deploy_cleaner_cmds = ["/usr/local/bin/sbctl -d sn deploy-cleaner"] + with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor: + tasks = [executor.submit(ssh_exec, ip, deploy_cleaner_cmds, check=False) + for ip in all_setup_ips] + for t in tasks: + t.result() + print("Phase 1.5a: DONE.") + + print("Phase 1.5b: Removing any straggler containers and pruning Docker...") + docker_cleanup_cmds = [ + "containers=$(docker ps -aq); " + "if [ -n \"$containers\" ]; then docker rm -f $containers; fi", + "docker system prune -af --volumes", + ] + with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor: + tasks = [executor.submit(ssh_exec, ip, docker_cleanup_cmds, check=False) + for ip in all_setup_ips] + for t in tasks: + t.result() + print("Phase 1.5b: DONE.") + + # NVMe partition cleanup. deploy-cleaner already pulls SPDK off the + # drives, but a prior deploy may have left GPT tables / filesystem + # signatures / leftover namespace state behind. Wipe signatures, then + # nvme-format every non-root NVMe so the data plane sees a clean slate. + # sn configure --enable-inline-checksum --force will reformat to a + # metadata-capable LBAF on top of this. Storage nodes only -- the mgmt + # node is never used for SPDK data devices. 
+    print("Phase 1.5c: Wiping partitions and formatting NVMes on storage nodes...")
+    nvme_cleanup_script = r"""set -u
+root_src=$(findmnt -no SOURCE / 2>/dev/null || true)
+root_dev=$(echo "$root_src" | sed -E 's|p?[0-9]+$||')
+echo "Root NVMe (will be skipped): $root_dev"
+for d in $(lsblk -dno NAME,TYPE | awk '$2=="disk" && $1 ~ /^nvme/ {print "/dev/"$1}'); do
+  [ -b "$d" ] || continue
+  if [ "$d" = "$root_dev" ]; then
+    echo "Skip $d (root)"
+    continue
+  fi
+  for p in ${d}p*; do
+    [ -b "$p" ] || continue
+    umount -f "$p" 2>/dev/null || true
+  done
+  echo "Wiping $d (wipefs)"
+  wipefs -af "$d" 2>/dev/null || true
+  echo "Formatting $d (nvme format -s 0)"
+  nvme format "$d" -f -s 0 2>/dev/null || \
+    echo "  WARN: nvme format failed on $d (continuing; sn configure will retry)"
+done
+"""
+    with ThreadPoolExecutor(max_workers=len(sn_ips)) as executor:
+        tasks = [executor.submit(ssh_exec, ip, [nvme_cleanup_script], check=False)
+                 for ip in sn_ips]
+        for t in tasks:
+            t.result()
+    print("Phase 1.5c: DONE.")
+
+    print("Phase 1.5d: Fresh-pulling simplyblock + ultra images on every node...")
+    # Pull with retry: public.ecr.aws occasionally returns transient errors
+    # (IPv6 source-address races, S3 signed-URL hiccups, etc.). Retry up to
+    # 6 times with 15s backoff so one node's blip doesn't abort the deploy.
+    pull_script = """python3 - <<'PY'
+import os, subprocess, sys, time
+import simplyblock_core
+envf = os.path.join(os.path.dirname(simplyblock_core.__file__), 'env_var')
+images = []
+with open(envf) as f:
+    for line in f:
+        if '=' not in line:
+            continue
+        k, v = line.strip().split('=', 1)
+        if k in ('SIMPLY_BLOCK_DOCKER_IMAGE', 'SIMPLY_BLOCK_SPDK_ULTRA_IMAGE') and v:
+            images.append(v)
+if not images:
+    print('no images found in env_var', file=sys.stderr)
+    sys.exit(1)
+for img in images:
+    print(f'Pulling {img}', flush=True)
+    last_rc = 1
+    for attempt in range(1, 7):
+        last_rc = subprocess.call(['docker', 'pull', img])
+        if last_rc == 0:
+            break
+        print(f'  pull failed (rc={last_rc}), attempt {attempt}/6 - retry in 15s', flush=True)
+        time.sleep(15)
+    if last_rc != 0:
+        print(f'  giving up on {img} after 6 attempts', file=sys.stderr)
+        sys.exit(last_rc)
+PY"""
+    with ThreadPoolExecutor(max_workers=len(all_setup_ips)) as executor:
+        tasks = [executor.submit(ssh_exec, ip, [pull_script], check=True)
+                 for ip in all_setup_ips]
+        for t in tasks:
+            t.result()
+    print("Phase 1.5d: DONE - all nodes have fresh images.")
+
+    inline_checksum = not args.no_inline_checksum
+    checksum_flag = " --enable-inline-checksum" if inline_checksum else ""
+    print(f"Inline checksum validation: {'ENABLED' if inline_checksum else 'disabled'}")
+
     # --- Phase 2: cluster create + sn configure/deploy ---
     print("Phase 2a: Creating cluster on management node...")
     ssh_exec(mgmt_ip, [
         "/usr/local/bin/sbctl -d cluster create --enable-node-affinity"
         " --data-chunks-per-stripe 2 --parity-chunks-per-stripe 2"
+        + checksum_flag
     ], check=True)
     print("Phase 2a: DONE - cluster created.")
 
+    # sn configure --force always prompts "Type YES/Y to continue" before
+    # formatting NVMes (see simplyblock_core/utils/__init__.py:~1789). The
+    # prompt is for interactive safety; here we feed YES on stdin so the
+    # automated deploy doesn't hang the full SSH timeout (10 min) on the
+    # confirmation. Wrap with `echo YES | ...` instead of plumbing stdin
+    # through ssh_exec because it's localized to this one command.
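+    # With this diff's defaults (MAX_LVOL = "25", inline checksum on) the
+    # assembled command comes out roughly as:
+    #   echo YES | /usr/local/bin/sbctl -d sn configure --max-lvol 25 \
+    #       --enable-inline-checksum --force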
print("Phase 2b: Configuring storage nodes...") + configure_cmd = ( + f"/usr/local/bin/sbctl -d sn configure --max-lvol {shlex.quote(args.max_lvol)}" + + checksum_flag + (" --force" if inline_checksum else "") + ) + if inline_checksum: + configure_cmd = f"echo YES | {configure_cmd}" with ThreadPoolExecutor(max_workers=len(sn_ips)) as executor: - tasks = [executor.submit(ssh_exec, ip, [ - f"/usr/local/bin/sbctl -d sn configure --max-lvol {shlex.quote(args.max_lvol)}" - ], check=True) for ip in sn_ips] + tasks = [executor.submit(ssh_exec, ip, [configure_cmd], check=True) + for ip in sn_ips] for t in tasks: t.result() print("Phase 2b: DONE - all SNs configured.") @@ -469,6 +672,12 @@ def main(): with open(args.metadata_out, "w") as f: json.dump(final_metadata, f, indent=4) + try: + alceml_summary = fetch_alceml_modes(mgmt_ip, cluster_uuid) + print_alceml_summary(alceml_summary) + except Exception as exc: + print(f"WARNING: failed to fetch ALCEML mode summary: {exc}") + print("\n--- Setup Complete ---") print(f"Cluster {cluster_uuid} is active. Metadata saved to {args.metadata_out}.") diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index d0b3ee71a..f00f0b662 100755 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -104,6 +104,7 @@ def init_storage_node__configure(self, subparser): argument = subcommand.add_argument('--size-range', help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together.', type=str, default='', dest='size_range', required=False) argument = subcommand.add_argument('--nvme-names', help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1.', type=str, default='', dest='nvme_names', required=False) argument = subcommand.add_argument('--force', help='Force format detected or passed nvme pci address to 4K and clean partitions.', dest='force', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='When formatting (with --force), prefer an LBAF that supports >=8 bytes of NVMe metadata per block, so alceml can run inline checksum validation in md-on-device mode. Drives with no md-capable LBAF still format to plain 4K and will use the fallback layout.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--calculate-hp-only', help='Calculate the minimum required huge pages, it depends on the following params: --cores-percentage, --sockets-to-use, --max-lvol, --nodes-per-socket, --number-of-devices.', dest='calculate_hp_only', action='store_true') argument = subcommand.add_argument('--number-of-devices', help='Number of devices that will be used on this host. For calculating huge pages memory only.', type=int, dest='number_of_devices') @@ -419,6 +420,7 @@ def init_cluster__create(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--disable-monitoring', help='Disable monitoring stack, false by default. Default: `false`.', dest='disable_monitoring', action='store_true') argument = subcommand.add_argument('--strict-node-anti-affinity', help='Enable strict node anti affinity for storage nodes. Never more than one chunk is placed on a node. 
This requires a minimum of _data-chunks-in-stripe + parity-chunks-in-stripe + 1_ nodes in the cluster.', dest='strict_node_anti_affinity', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='Enable inline CRC checksum validation on every IO for silent-data-error protection. Cannot be enabled or disabled after cluster creation. Per-device alceml mode (md-on-device vs fallback) is auto-detected at add-node.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--name', '-n', help='Assigns a name to the newly created cluster.', type=str, dest='name') argument = subcommand.add_argument('--qpair-count', help='The NVMe/TCP transport qpair count per logical volume. Default: `32`.', type=range_type(0, 128), default=32, dest='qpair_count') argument = subcommand.add_argument('--client-qpair-count', help='The default NVMe/TCP transport qpair count per logical volume for client. Default: `3`.', type=range_type(0, 128), default=3, dest='client_qpair_count') @@ -453,6 +455,7 @@ def init_cluster__add(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--inflight-io-threshold', help='The number of inflight IOs allowed before the IO queuing starts. Default: `4`.', type=int, default=4, dest='inflight_io_threshold') argument = subcommand.add_argument('--strict-node-anti-affinity', help='Enable strict node anti affinity for storage nodes. Never more than one chunk is placed on a node. This requires a minimum of _data-chunks-in-stripe + parity-chunks-in-stripe + 1_ nodes in the cluster."', dest='strict_node_anti_affinity', action='store_true') + argument = subcommand.add_argument('--enable-inline-checksum', help='Enable inline CRC checksum validation on every IO for silent-data-error protection. 
Cannot be enabled or disabled after cluster creation.', dest='inline_checksum', action='store_true') argument = subcommand.add_argument('--name', '-n', help='Assigns a name to the newly created cluster.', type=str, dest='name') argument = subcommand.add_argument('--client-data-nic', help='Network interface name from client to use for logical volume connection.', type=str, dest='client_data_nic') argument = subcommand.add_argument('--use-backup', help='The path to JSON file with S3/MinIO backup configuration.', type=str, dest='use_backup') diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 05378f51a..e49727ada 100755 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -140,7 +140,8 @@ def storage_node__configure(self, sub_command, args): args.max_lvol, max_prov, sockets_to_use,args.nodes_per_socket, pci_allowed, pci_blocked, force=args.force, device_model=args.device_model, size_range=args.size_range, cores_percentage=cores_percentage, nvme_names=nvme_names, - calculate_hp_only=args.calculate_hp_only, number_of_devices=number_of_devices) + calculate_hp_only=args.calculate_hp_only, number_of_devices=number_of_devices, + inline_checksum=args.inline_checksum) def storage_node__deploy_cleaner(self, sub_command, args): storage_ops.deploy_cleaner() @@ -1001,12 +1002,14 @@ def cluster_add(self, args): with open(args.use_backup, 'r') as f: backup_config = _json.load(f) + inline_checksum = getattr(args, 'inline_checksum', False) return cluster_ops.add_cluster( blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric, client_data_nic, max_fault_tolerance=max_fault_tolerance, backup_config=backup_config, - nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port) + nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port, + inline_checksum=inline_checksum) def cluster_create(self, args): import json as _json @@ -1043,6 +1046,7 @@ def cluster_create(self, args): is_single_node = args.is_single_node fabric = args.fabric client_data_nic = args.client_data_nic + inline_checksum = getattr(args, 'inline_checksum', False) max_fault_tolerance = min(distr_npcs, 2) if distr_npcs >= 1 else 1 @@ -1060,7 +1064,8 @@ def cluster_create(self, args): strict_node_anti_affinity, name, tls_secret, ingress_host_source, dns_name, fabric, is_single_node, client_data_nic, max_fault_tolerance=max_fault_tolerance, backup_config=backup_config, - nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port) + nvmf_base_port=args.nvmf_base_port, rpc_base_port=args.rpc_base_port, snode_api_port=args.snode_api_port, + inline_checksum=inline_checksum) def query_yes_no(self, question, default="yes"): """Ask a yes/no question via raw_input() and return their answer. 
diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index c40b96875..0e9dc8239 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -15,7 +15,8 @@ from docker.errors import DockerException from simplyblock_core import utils, scripts, constants, mgmt_node_ops, storage_node_ops -from simplyblock_core.controllers import backup_controller, cluster_events, device_controller, qos_controller, tasks_controller +from simplyblock_core.controllers import backup_controller, cluster_events, device_controller, qos_controller, tasks_controller, tcp_ports_events +from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.db_controller import DBController from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule @@ -225,7 +226,8 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, enable_node_affinity, qpair_count, client_qpair_count, max_queue_size, inflight_io_threshold, disable_monitoring, strict_node_anti_affinity, name, tls_secret, ingress_host_source, dns_name, fabric, is_single_node, client_data_nic, nvmeof_tls_config=None, max_fault_tolerance=1, backup_config=None, - nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, container_image_prefix=None) -> str: + nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, container_image_prefix=None, + inline_checksum=False) -> str: if distr_ndcs == 0 and distr_npcs == 0: raise ValueError("both distr_ndcs and distr_npcs cannot be 0") @@ -348,6 +350,7 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.disable_monitoring = disable_monitoring cluster.mode = mode cluster.full_page_unmap = False + cluster.inline_checksum = bool(inline_checksum) cluster.client_data_nic = client_data_nic or "" cluster.max_fault_tolerance = max_fault_tolerance cluster.nvmf_base_port = nvmf_base_port @@ -468,7 +471,8 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, cr_namespace=None, cr_plural=None, fabric="tcp", cluster_ip=None, grafana_secret=None, client_data_nic="", max_fault_tolerance=1, backup_config=None, - nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001) -> str: + nvmf_base_port=4420, rpc_base_port=8080, snode_api_port=50001, + inline_checksum=False) -> str: default_cluster = None @@ -565,6 +569,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] cluster.full_page_unmap = False + cluster.inline_checksum = bool(inline_checksum) cluster.client_data_nic = client_data_nic or "" cluster.max_fault_tolerance = max_fault_tolerance cluster.nvmf_base_port = nvmf_base_port @@ -607,6 +612,22 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: ols_status = Cluster.STATUS_UNREADY else: set_cluster_status(cl_id, Cluster.STATUS_IN_ACTIVATION) + + # First-time activation runs while no primary LVS is serving fabric I/O + # yet, so the recreate paths run with activation_mode=True (peer LVS / + # leader / hublvol RPCs short-circuited — peer stacks aren't fully built + # during this phase, so they would not be safe to call). Re-activation + # (e.g. 
suspended → in_activation after JCERR, or force-reactivating an
+    # active/degraded cluster) is different: every primary's SPDK and lvstore
+    # are still alive and serving I/O — the secondary's examine of its non-
+    # leader raid0 races the live leader's blob-metadata writes and fails
+    # with bs_load_cur_extent_page_valid CRC mismatch on every retry
+    # (observed 2026-05-11, LVS_6769 on node 8084 — 22+ minute examine loop).
+    # We keep activation_mode=True (so peer LVS/hublvol RPCs stay disabled)
+    # and add only a firewall-only port-block on the live leader around the
+    # non-leader recreate in Pass 2. Port-block is benign on peers whose
+    # service isn't listening, so it's safe even against not-fully-built peers.
+    is_fresh_activation = (ols_status == Cluster.STATUS_UNREADY)
     snodes = db_controller.get_storage_nodes_by_cluster_id(cl_id)
     online_nodes = []
     dev_count = 0
@@ -744,16 +765,61 @@
         for primary_node in primary_nodes:
             primary_node.lvstore_status = "in_creation"
             primary_node.write_to_db()
+
+            # On re-activation the primary's LVS is still alive and serving
+            # client I/O — snode's examine of its non-leader raid0 will race
+            # the leader's blob-metadata writes unless we quiesce the leader
+            # first. We do this with a firewall-only port-block on the leader:
+            # it has no effect on a peer whose service isn't listening (per
+            # design, safe even when peer stacks aren't fully built yet) but
+            # it stops the live leader from issuing writes that race the
+            # examine. We deliberately do NOT switch the helper out of
+            # activation_mode here: that would enable peer leader/distrib/
+            # lvstore/hublvol RPCs which presume the peer's full stack is up.
+            leader_blocked = False
+            leader_port = None
+            leader_ptype = "tcp"
+            if not is_fresh_activation and primary_node.status == StorageNode.STATUS_ONLINE:
+                try:
+                    leader_port = primary_node.get_lvol_subsys_port(primary_node.lvstore)
+                    leader_ptype = "udp" if primary_node.active_rdma else "tcp"
+                    FirewallClient(primary_node, timeout=3, retry=1).firewall_set_port(
+                        leader_port, leader_ptype, "block", primary_node.rpc_port)
+                    tcp_ports_events.port_deny(primary_node, leader_port)
+                    leader_blocked = True
+                    time.sleep(0.5)
+                except Exception as e:
+                    logger.warning(
+                        "Re-activation: port-block on leader %s for %s failed: %s — "
+                        "proceeding without block (secondary examine may race live leader writes)",
+                        primary_node.get_id(), primary_node.lvstore, e)
+
             try:
-                r = storage_node_ops.recreate_lvstore_on_non_leader(
-                    snode, primary_node, primary_node, activation_mode=True)
-            except storage_node_ops.LVSRestartRequiredError as e:
-                logger.error(e)
-                set_cluster_status(cl_id, ols_status)
-                raise ValueError(
-                    f"Failed to activate cluster: node {e.node_id} holds "
-                    f"partial state for LVS {e.lvs_name} (non-leader). "
-                    f"Restart node {e.node_id} before activating.")
+                try:
+                    r = storage_node_ops.recreate_lvstore_on_non_leader(
+                        snode, primary_node, primary_node, activation_mode=True)
+                except storage_node_ops.LVSRestartRequiredError as e:
+                    logger.error(e)
+                    set_cluster_status(cl_id, ols_status)
+                    raise ValueError(
+                        f"Failed to activate cluster: node {e.node_id} holds "
+                        f"partial state for LVS {e.lvs_name} (non-leader). "
+                        f"Restart node {e.node_id} before activating.")
+            finally:
+                if leader_blocked:
+                    try:
+                        FirewallClient(primary_node, timeout=3, retry=1).firewall_set_port(
+                            leader_port, leader_ptype, "allow", primary_node.rpc_port)
+                        tcp_ports_events.port_allowed(primary_node, leader_port)
+                    except Exception as ue:
+                        logger.error(
+                            "Failed to unblock leader %s:%s after non-leader recreate: %s — scheduling port_allow",
+                            primary_node.get_id(), leader_port, ue)
+                        try:
+                            tasks_controller.add_port_allow_task(
+                                primary_node.cluster_id, primary_node.get_id(), leader_port)
+                        except Exception as se:
+                            logger.error("Failed to schedule port_allow fallback: %s", se)
 
             if not r:
                 ret = False
diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py
index b264c689c..b6fc94fd2 100644
--- a/simplyblock_core/controllers/device_controller.py
+++ b/simplyblock_core/controllers/device_controller.py
@@ -297,12 +297,21 @@ def _def_create_device_stack(device_obj, snode, force=False, clear_data=False):
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
 
     if alceml_name not in bdev_names:
+        checksum_method, cache_size, cache_eviction_threshold = utils.alceml_checksum_params(cluster, device_obj)
+        if cluster.inline_checksum and not device_obj.md_supported:
+            logger.warning(
+                f"Inline checksum: device {device_obj.get_id()} ({device_obj.pcie_address}) has no NVMe metadata; "
+                f"alceml will run in fallback mode (extra md page, ~1.17% capacity overhead)."
+            )
         ret = snode.create_alceml(
             alceml_name, nvme_bdev, alceml_id,
             pba_init_mode=3 if clear_data else 2,
             write_protection=cluster.distr_ndcs > 1,
             pba_page_size=cluster.page_size_in_blocks,
-            full_page_unmap=cluster.full_page_unmap
+            full_page_unmap=cluster.full_page_unmap,
+            checksum_method=checksum_method,
+            cache_size=cache_size,
+            cache_eviction_threshold=cache_eviction_threshold,
         )
 
         if not ret:
diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py
index b01ded610..041f397aa 100644
--- a/simplyblock_core/controllers/lvol_controller.py
+++ b/simplyblock_core/controllers/lvol_controller.py
@@ -126,11 +126,27 @@ def _create_crypto_lvol(rpc_client, name, base_name, key1, key2):
     if not ret:
         logger.error(f"Failed to find LVol bdev {base_name}")
         return False
+
+    # Idempotent: if the crypto bdev already exists from a prior partial
+    # activation/restart pass, skip the key + crypto-bdev creates. SPDK
+    # rejects duplicate creates with hard errors that would otherwise
+    # break re-activation convergence.
+    if rpc_client.get_bdevs(name):
+        logger.info("crypto LVol %s already exists, skipping create", name)
+        return True
+
     key_name = f'key_{name}'
     ret = rpc_client.lvol_crypto_key_create(key_name, key1, key2)
     if not ret:
-        logger.error("failed to create crypto key")
-        return False
+        # SPDK returns failure when the key name already exists. On
+        # re-activation that's the same node re-issuing the same key —
+        # treat existing key as benign and proceed to the crypto-bdev
+        # create below. If creation genuinely failed for another reason,
+        # the next call will surface it.
+ logger.warning( + "lvol_crypto_key_create returned failure for %s; if the key " + "already exists from a prior pass this is expected — " + "proceeding to crypto bdev create", key_name) ret = rpc_client.lvol_crypto_create(name, base_name, key_name) if not ret: logger.error(f"failed to create crypto LVol {name}") @@ -432,6 +448,11 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= if dev.status == dev.STATUS_ONLINE: dev_count += 1 cluster_size_total += dev.size + # Inline-checksum fallback layout reserves 6 of every 510 data blocks + # per 2 MiB extent for the extended md page + filler. Charge that as + # initial utilization rather than reducing reported raw capacity. + if cl.inline_checksum and not dev.md_supported: + cluster_size_prov += utils.alceml_fallback_overhead_bytes(cl, dev.size) if len(online_nodes) == 0: logger.error("No online Storage nodes found") @@ -1631,9 +1652,21 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO, for h in lvol.allowed_hosts: if h["nqn"] == host_nqn: host_entry = h - pool = db_controller.get_pool_by_id(lvol.pool_uuid) - host_entry["dhchap_key"] = pool.dhchap_key - host_entry["dhchap_ctrlr_key"] = pool.dhchap_ctrlr_key + # Note: an earlier change (sfam-2722) unconditionally + # injected ``pool.dhchap_key`` / ``pool.dhchap_ctrlr_key`` + # into ``host_entry`` here. That broke three contracts: + # - it overrode existing host-level keys + # - it injected pool keys when the pool had no DHCHAP + # configured (pool.dhchap_key=None) + # - it emitted ``--dhchap-secret`` for hosts using PSK + # (TLS auth), which conflicts with the host's chosen + # auth mode + # Pool-level DHCHAP keys are retrieved by clients via a + # separate path (see ``_register_pool_dhchap_keys_on_node``); + # the connect command must not embed them. If sfam-2722 + # needs pool keys propagated for a specific case, add a + # guarded code path that gates on ``pool.dhchap`` and + # only sets keys the host_entry doesn't already have. break if not host_entry: return False, f"Host NQN {host_nqn} not found in allowed hosts for volume {uuid}" diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 5abdecd44..f90552c61 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,5 +1,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.34 +SIMPLY_BLOCK_VERSION=19.2.36 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inline-checksum-validation +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:checksum-validation-latest diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 1f0588a1b..29b79c7db 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -73,6 +73,9 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + # Inline CRC checksum validation for silent-data-error protection. + # Frozen at cluster create time; no upgrade path for existing clusters. 
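+    # Per-device behavior when True is decided at alceml create time from
+    # NVMeDevice.md_supported via utils.alceml_checksum_params: md-capable
+    # devices run md-on-device (method 1), the rest fall back to the extra
+    # md-page layout (method 2).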
+ inline_checksum: bool = False snapshot_replication_target_cluster: str = "" snapshot_replication_target_pool: str = "" snapshot_replication_timeout: int = 60*10 diff --git a/simplyblock_core/models/nvme_device.py b/simplyblock_core/models/nvme_device.py index 1badd5942..b1e7299ee 100644 --- a/simplyblock_core/models/nvme_device.py +++ b/simplyblock_core/models/nvme_device.py @@ -66,6 +66,11 @@ class NVMeDevice(BaseModel): last_flap_tsc: float = 0.0 serial_number: str = "" size: int = -1 + # NVMe per-block metadata size in bytes, as reported by the bound SPDK bdev. + # >=8 means alceml can run in cv_md_method (no read/write amplification). + # 0 means alceml must use cv_fallback_method (extra md page per 2 MiB extent). + md_size: int = 0 + md_supported: bool = False testing_bdev: str = "" connecting_from_node: str = "" previous_status: str = "" diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 3b1243dff..ad289a8db 100755 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -570,7 +570,8 @@ def qos_vbdev_delete(self, name): def bdev_alceml_create(self, alceml_name, nvme_name, uuid, pba_init_mode=3, alceml_cpu_mask="", alceml_worker_cpu_mask="", pba_page_size=2097152, - write_protection=False, full_page_unmap=False): + write_protection=False, full_page_unmap=False, + checksum_method=0, cache_size=0, cache_eviction_threshold=0): params = { "name": alceml_name, "cntr_path": nvme_name, @@ -594,6 +595,15 @@ def bdev_alceml_create(self, alceml_name, nvme_name, uuid, pba_init_mode=3, params["write_protection"] = True if full_page_unmap: params["use_map_whole_page_on_1st_write"] = True + # Inline CRC checksum validation. method: 0=off, 1=md-on-device, 2=fallback (extra md page). + # The data plane reads md_size from spdk_bdev_get_md_size and refuses method=1 when md_size==0, + # so the caller must pick method=2 for devices without NVMe metadata support. + if checksum_method: + params["checksum_validation_method"] = int(checksum_method) + if cache_size: + params["cache_size"] = int(cache_size) + if cache_eviction_threshold: + params["cache_eviction_threshold"] = int(cache_eviction_threshold) return self._request("bdev_alceml_create", params) def bdev_distrib_create(self, name, vuid, ndcs, npcs, num_blocks, block_size, jm_names, diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 3e9a72c8d..1f5ea5e1a 100755 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -122,6 +122,61 @@ def _rpc_lvstore_exists(rpc_client, lvs_name): return False +def _kill_spdk_until_dead(snode, max_attempts=3, poll_per_attempt_sec=5, + poll_interval=0.25): + """Kill SPDK on `snode` and return only after it is verifiably gone. + + Per design: any abort during restart MUST kill SPDK so the next attempt + starts from a clean process — leftover bdevs (raid0_, lvol + subsystems) cause "Duplicate bdev name" / "Subsystem already exists" + failures on retry that loop the auto-restart forever. + + The previous behavior (single 5 s soft window, log warning, proceed) + silently left zombies behind. We now retry the kill until SPDK is + confirmed down. Bounded total wall-clock = max_attempts * + poll_per_attempt_sec so a wedged docker daemon cannot trap the caller. + Returns True if SPDK died, False if all attempts exhausted (caller is + responsible for whatever comes next; the node should still be marked + OFFLINE so it stops being treated as in_restart). 
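+
+    With the defaults this bounds the post-kill polling at max_attempts *
+    poll_per_attempt_sec = 3 * 5 = 15 s (plus whatever the kill RPC itself
+    takes under its own client timeout).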
+ """ + snode_api = snode.client(timeout=5, retry=5) + for attempt in range(1, max_attempts + 1): + try: + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) + except Exception as e: + logger.warning( + "spdk_process_kill RPC failed on %s (attempt %d/%d): %s", + snode.get_id(), attempt, max_attempts, e, + ) + + deadline = time.time() + poll_per_attempt_sec + while time.time() < deadline: + try: + up = snode_api.spdk_process_is_up(snode.rpc_port, snode.cluster_id) + except Exception: + up = False + if not up: + logger.info( + "SPDK on %s confirmed down (kill attempt %d/%d)", + snode.get_id(), attempt, max_attempts, + ) + return True + time.sleep(poll_interval) + + logger.warning( + "SPDK on %s still up after %ds (attempt %d/%d); re-issuing kill", + snode.get_id(), poll_per_attempt_sec, attempt, max_attempts, + ) + + logger.error( + "SPDK on %s did NOT die after %d kill attempts (%ds total) — " + "investigate snode_api / docker daemon health on %s", + snode.get_id(), max_attempts, + max_attempts * poll_per_attempt_sec, snode.mgmt_ip, + ) + return False + + def _reapply_allowed_hosts(lvol, snode, rpc_client): """Re-register allowed hosts (with DHCHAP keys) on a subsystem after recreation.""" from simplyblock_core.controllers.lvol_controller import _register_dhchap_keys_on_node, _get_dhchap_group @@ -471,22 +526,43 @@ def _search_for_partitions(rpc_client, nvme_device): def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): + # When the two NVMes on a node format to different LBAFs (e.g. + # heterogeneous hardware where one drive supports an md-capable LBAF + # and the other doesn't), the resulting partition bdevs end up with + # different md_size and SPDK's RAID1 layer rejects the mirror with + # EINVAL ("different metadata format than base bdev"). For lab + # configurations where we deliberately want to exercise both + # md-on-device and fallback alceml modes in the same cluster, fall + # back to a single-bdev JM (no mirror) on the first partition so + # the deploy can proceed. We persist only that first bdev in + # jm_nvme_bdev_list so the restart path takes the existing single- + # bdev recreate branch and doesn't try to rebuild the RAID. 
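+    # Tradeoff (accepted for lab use only): a single-bdev JM gives up the
+    # RAID1 mirror's redundancy for the journal — fine here because the
+    # heterogeneous layout exists to exercise both checksum modes, not to
+    # test JM durability.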
    if snode.jm_device and snode.jm_device.raid_bdev:
         raid_bdev = snode.jm_device.raid_bdev
         if raid_bdev.startswith("raid_jm_"):
             raid_level = "1"
             ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level)
             if not ret:
-                logger.error(f"Failed to create raid_jm_{snode.get_id()}")
-                return False
+                logger.warning(
+                    f"RAID create failed for {raid_bdev} on bdevs {jm_nvme_bdevs} "
+                    f"(likely heterogeneous metadata format); falling back to "
+                    f"single-bdev JM on {jm_nvme_bdevs[0]}"
+                )
+                raid_bdev = jm_nvme_bdevs[0]
+                jm_nvme_bdevs = [jm_nvme_bdevs[0]]
     else:
         if len(jm_nvme_bdevs) > 1:
             raid_bdev = f"raid_jm_{snode.get_id()}"
             raid_level = "1"
             ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level)
             if not ret:
-                logger.error(f"Failed to create raid_jm_{snode.get_id()}")
-                return False
+                logger.warning(
+                    f"RAID create failed for {raid_bdev} on bdevs {jm_nvme_bdevs} "
+                    f"(likely heterogeneous metadata format); falling back to "
+                    f"single-bdev JM on {jm_nvme_bdevs[0]}"
+                )
+                raid_bdev = jm_nvme_bdevs[0]
+                jm_nvme_bdevs = [jm_nvme_bdevs[0]]
         else:
             raid_bdev = jm_nvme_bdevs[0]
@@ -667,12 +743,22 @@
 
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
 
+    checksum_method, cache_size, cache_eviction_threshold = utils.alceml_checksum_params(cluster, nvme)
+    if cluster.inline_checksum and not nvme.md_supported:
+        logger.warning(
+            f"Inline checksum: device {nvme.get_id()} ({nvme.pcie_address}) has no NVMe metadata; "
+            f"alceml will run in fallback mode (extra md page, ~1.17% capacity overhead)."
+        )
+
     ret = snode.create_alceml(
         alceml_name, nvme_bdev, alceml_id,
         pba_init_mode=1 if (after_restart and nvme.status != NVMeDevice.STATUS_NEW) else 3,
         write_protection=cluster.distr_ndcs > 1,
         pba_page_size=cluster.page_size_in_blocks,
         full_page_unmap=cluster.full_page_unmap,
+        checksum_method=checksum_method,
+        cache_size=cache_size,
+        cache_eviction_threshold=cache_eviction_threshold,
     )
 
     if not ret:
@@ -2180,6 +2266,36 @@
             f"Restart of {node_id} failed (post-status={post_node.status}); "
             f"resetting to OFFLINE to unblock future attempts"
         )
+
+        # Abort contract: SPDK MUST be killed on every failed
+        # restart that owned the lock, so the next attempt starts
+        # from a clean process. Without this, _restart_storage_node_impl
+        # has ~20 different `return False` paths (per-device setup,
+        # examine, subsystem create, listener add, remote-dev
+        # connect, etc.) that all leave SPDK running with whatever
+        # bdevs the impl already set up — causing the next attempt
+        # to fail on "Duplicate bdev name for manual examine:
+        # raid0_" / "Subsystem NQN ... already exists" and
+        # loop forever (incident 2026-05-10, b278fd62 restart
+        # attempts 1–3). Routing every owned-lock failure through
+        # _kill_spdk_until_dead closes those gaps in one place.
+        # Idempotent: a fast no-op when SPDK was never started in
+        # this attempt. Inner abort paths (recreate_lvstore's
+        # _abort_restart_and_unblock, restart_storage_node's
+        # _abort_restart) emit the snode_restart_failed event
+        # already; the wrapper does NOT re-emit it to avoid
+        # duplicate events and to avoid the FDB write that
+        # `snode_restart_failed` performs unconditionally (which
+        # would raise SystemExit through base_model.write_to_db
+        # on hosts without FDB — the wrapper must not depend on
+        # FDB liveness for cleanup correctness).
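+        # (Concretely: the wrapper polls spdk_process_is_up via
+        # _kill_spdk_until_dead until it reports False — or attempts run
+        # out — before forcing the OFFLINE write below.)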
+ try: + _kill_spdk_until_dead(post_node) + except Exception as kill_exc: + logger.error( + f"Restart cleanup: kill SPDK on {node_id} raised: {kill_exc}" + ) + # Force the OFFLINE write — bypass the state-machine guard # in set_node_status (which only restricts ONLINE writes # anyway, but we use a direct write here to avoid any @@ -2590,6 +2706,9 @@ def _restart_storage_node_impl( db_dev.nvme_bdev = found_dev.nvme_bdev db_dev.nvme_controller = found_dev.nvme_controller db_dev.pcie_address = found_dev.pcie_address + # Refresh md detection so a re-format between restarts is reflected + db_dev.md_size = found_dev.md_size + db_dev.md_supported = found_dev.md_supported # if db_dev.status in [ NVMeDevice.STATUS_ONLINE]: # db_dev.status = NVMeDevice.STATUS_UNAVAILABLE @@ -2754,11 +2873,19 @@ def _restart_storage_node_impl( # Before each, perform disconnect checks on the other two nodes. def _abort_restart(reason): - """Kill SPDK and set offline on fatal error.""" + """Kill SPDK and set offline on fatal error. + + Contract: any abort during restart kills SPDK reliably (verified + down) before returning, so the next restart attempt starts from + a clean SPDK process. The previous implementation issued a + single fire-and-forget ``spdk_process_kill`` and proceeded — + which left zombie SPDK behind when docker-rm took >5 s, + causing the next attempt to fail with "Duplicate bdev name for + manual examine: raid0_" and loop forever. + """ logger.error(f"Restart abort: {reason}") storage_events.snode_restart_failed(snode) - snode_api_inner = snode.client(timeout=5, retry=5) - snode_api_inner.spdk_process_kill(snode.rpc_port, snode.cluster_id) + _kill_spdk_until_dead(snode) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) try: @@ -2768,10 +2895,16 @@ def _abort_restart(reason): _abort_restart(f"LVS recreation failed: {e}") return False if not ret: + # Restart abort path. recreate_all_lvstores returning False is + # ALSO a restart abort and must honor the same kill+offline + # contract — otherwise SPDK keeps running with the partial + # bdev stack from this attempt (e.g. raid0_ created via + # auto-examine) and the next retry fails on "Duplicate bdev + # name". 10:58:11 in the AWS soak run hit exactly this gap. snode = db_controller.get_storage_node_by_id(snode.get_id()) snode.lvstore_status = "failed" snode.write_to_db() - set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) + _abort_restart("recreate_all_lvstores returned False") return False # === Phase 10: Finalization — post all LVS recreation === @@ -3345,7 +3478,6 @@ def suspend_storage_node(node_id, force=False, change_node_status=True): logger.info("Suspending node") - rpc_client = snode.rpc_client() fw_api = FirewallClient(snode, timeout=20, retry=1) port_type = "tcp" if snode.active_rdma: @@ -3367,14 +3499,27 @@ def _revert_blocked_ports(): try: # Block per-lvstore ports for secondary lvstores hosted on this node. - # Order: client (lvs) port FIRST, then 1 s for peers to drain in-flight - # IO via the still-open hublvol back to the acting leader, then the - # hublvol port. Reversing this order — blocking the hublvol port while - # the client port is still open — kills the redirect path while clients - # may still be funneling IO into the secondary, forcing the secondary's - # lvstore layer to auto-promote (lvol.c:3508 "Leadership changed due - # to receive new IO"), which races the in-flight leader and produces - # a writer conflict (incident 2026-05-06 iter 2, jm_vuid=4450). + # Per-LVS order: + # 1. 
block client (lvs) port — multipath clients fail over to peers + # 2. sleep so any pre-block in-flight IO already on our distrib + # pipeline can complete locally before we close the hublvol + # 3. block the hublvol port — surviving peer detects "no redirect + # target", JC consensus + auto-promotion on the peer elects the + # new leader cleanly (the in-flight IO that triggered earlier + # revisions of this code is already drained by step 2). + # We deliberately do NOT issue an explicit + # bdev_lvol_set_leader(leader=False) / bdev_distrib_force_to_non_leader + # at any point inside this loop. Prior incidents: + # 2026-05-06 iter 2, jm_vuid=4450 — hub closed before client; fixed + # by client-first ordering. + # 2026-05-10 iter 3, jm_vuid=4245 — multipath refilled the + # secondary's pipeline during the drain; tried "demote between + # the two blocks" (3ae2a9b0). Still produced a writer conflict + # because the explicit demote races pre-block IO still being + # processed on the local distrib (port block does not halt + # internal distrib processing of already-queued requests); the + # IO completes as non-leader → conflict. Dropped the demote + # entirely and rely on the peer's auto-promotion path. if snode.lvstore_stack_secondary or snode.lvstore_stack_tertiary: nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: @@ -3386,11 +3531,9 @@ def _revert_blocked_ports(): if sec_hub_port: _block_port(sec_hub_port) time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) # Block per-lvstore ports for this node's own primary lvstore - # (same client-port-first ordering — see comment above). + # (same per-LVS ordering — see comment above). own_lvs_port = snode.get_lvol_subsys_port(snode.lvstore) own_hub_port = snode.get_hublvol_port(snode.lvstore) _block_port(own_lvs_port) @@ -3398,8 +3541,6 @@ def _revert_blocked_ports(): if own_hub_port: _block_port(own_hub_port) time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) time.sleep(1) except Exception as e: logger.error(f"Failed during suspend port blocking/leadership transfer: {e}") @@ -3689,7 +3830,7 @@ def upgrade_automated_deployment_config(): def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None, k8s=False, - calculate_hp_only=False, number_of_devices=0): + calculate_hp_only=False, number_of_devices=0, inline_checksum=False): if calculate_hp_only: minimum_hp_memory = utils.calculate_hp_only(max_lvol, number_of_devices, sockets_to_use, nodes_per_socket, cores_percentage) hp_number = math.ceil(minimum_hp_memory / 2) @@ -3707,7 +3848,8 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod nodes_config, system_info = utils.generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage, force=force, - device_model=device_model, size_range=size_range, nvme_names=nvme_names) + device_model=device_model, size_range=size_range, nvme_names=nvme_names, + inline_checksum=inline_checksum) if not nodes_config or not nodes_config.get("nodes"): return False utils.store_config_file(nodes_config, constants.NODES_CONFIG_FILE, create_read_only_file=True) @@ -4806,6 +4948,33 @@ def _abort_and_unblock(reason): "Raid %s and lvstore %s already 
present on %s; skipping examine", primary_node.raid, primary_node.lvstore, snode.get_id()) else: + if raid_already and not lvstore_already: + # Same convergence trap as in recreate_lvstore: the raid was + # examined on a prior pass and the lvstore module did not + # surface it. SPDK rejects re-examine of an already-examined + # bdev with "Duplicate bdev name for manual examine", so a + # plain bdev_examine here is a silent no-op that loops the + # activation retry forever. Drop the raid and re-create via + # _create_bdev_stack (idempotent) so the next examine is + # against a freshly-registered raid. + logger.info( + "Raid %s present but lvstore %s did not surface on %s; " + "dropping raid for clean re-examine", + primary_node.raid, primary_node.lvstore, snode.get_id()) + try: + snode_rpc_client.bdev_raid_delete(primary_node.raid) + except Exception as e: + logger.warning( + "bdev_raid_delete(%s) raised: %s — proceeding to " + "_create_bdev_stack which is idempotent", + primary_node.raid, e) + ret, err = _create_bdev_stack(snode, primary_node.lvstore_stack, + primary_node=primary_node) + if not ret: + logger.error( + "Failed to rebuild bdev stack on %s after raid drop: %s", + snode.get_id(), err) + # Examine is required whenever the lvstore isn't surfaced — whether # the raid was freshly created by _create_bdev_stack (normal restart # path) or pre-existing with stale state (activation retry). @@ -5312,19 +5481,28 @@ def recreate_lvstore(snode, force=False, lvs_primary=None, activation_mode=False lvol_ana_state = "optimized" - ### 2- create lvols nvmf subsystems + ### 2- create lvols nvmf subsystems (idempotent: probe SPDK first; mirrors + ### the pattern in recreate_lvstore_on_non_leader so a re-activation that + ### finds the subsystem already present from a prior partial pass does not + ### emit "Subsystem NQN ... already exists" / "Unable to create subsystem". created_subsystems = [] for lvol in lvol_list: - if lvol.nqn not in created_subsystems: - allow_any = not bool(lvol.allowed_hosts) + if lvol.nqn in created_subsystems: + continue + allow_any = not bool(lvol.allowed_hosts) + if _rpc_subsystem_exists(rpc_client, lvol.nqn): + logger.info("subsystem %s already exists on %s, skipping create", + lvol.nqn, snode.get_id()) + created_subsystems.append(lvol.nqn) + else: logger.info("creating subsystem %s (allow_any_host=%s)", lvol.nqn, allow_any) ret = rpc_client.subsystem_create(lvol.nqn, lvol.ha_type, lvol.uuid, 1, max_namespaces=constants.LVO_MAX_NAMESPACES_PER_SUBSYS, allow_any_host=allow_any) if ret: created_subsystems.append(lvol.nqn) - if lvol.allowed_hosts: - _reapply_allowed_hosts(lvol, snode, rpc_client) + if lvol.allowed_hosts: + _reapply_allowed_hosts(lvol, snode, rpc_client) # ANA failback only when the original primary is coming back (not takeover) if not is_takeover and lvs_node.secondary_node_id and lvol_list: @@ -5361,31 +5539,21 @@ def _unblock_peer_port(peer): pass def _kill_app(): + """Kill SPDK on snode and mark OFFLINE before peer ports unblock. + + Holding the peer port blocks during this wait is intentional: + unblocking before SPDK is confirmed dead lets a residual primary + on snode race the acting-leader and produce a writer conflict. + + Implemented via the module-level :func:`_kill_spdk_until_dead` + helper so the same hardened kill logic is used by every abort + path (recreate_lvstore aborts here; restart_storage_node aborts + in `_abort_restart`). 
On total kill failure we still mark the + node OFFLINE so it stops being treated as in_restart by the + cluster, and so peer ports get released by the caller. + """ storage_events.snode_restart_failed(snode) - snode_api = snode.client(timeout=5, retry=5) - snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) - # spdk_process_kill returns as soon as the HTTP request is - # queued — SPDK may keep serving IO for a short while after. - # Block here until SPDK is actually gone so the subsequent - # peer-port unblock in _abort_restart_and_unblock cannot race - # client IO back into the secondary while a still-alive primary - # on snode tries to serve as leader (→ writer conflict). - # We hold the peer port blocks during this wait; cap it at 5 s - # so a stuck kill does not leave peers permanently blocked. - deadline = time.time() + 5 - while time.time() < deadline: - try: - up = snode_api.spdk_process_is_up(snode.rpc_port, snode.cluster_id) - except Exception: - up = False - if not up: - break - time.sleep(0.25) - else: - logger.warning( - "SPDK on %s still up 5s after kill signal; proceeding with unblock anyway", - snode.get_id(), - ) + _kill_spdk_until_dead(snode) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) def _abort_restart_and_unblock(reason): @@ -5556,6 +5724,42 @@ def _abort_restart_and_unblock(reason): "Raid %s and lvstore %s already present on %s; skipping examine", lvs_raid, lvs_name, snode.get_id()) else: + if raid_already and not lvstore_already: + # Raid is present but the lvstore module never surfaced it on + # this SPDK process (e.g. a prior activation pass examined the + # raid and the lvstore-side examine failed/was incomplete). + # SPDK rejects re-examine of an already-examined bdev with + # "Duplicate bdev name for manual examine: ", so calling + # bdev_examine again is a no-op that leaves the lvstore + # missing forever and burns the activation retry loop. + # + # Drop the raid so the underlying distribs are reusable, then + # re-create it via _create_bdev_stack (which is itself + # idempotent — it skips bdevs already present and only creates + # what's missing). The fresh bdev_examine below now runs + # against a newly-registered raid and the lvstore module gets + # a real chance to surface. + logger.info( + "Raid %s present but lvstore %s did not surface on %s; " + "dropping raid for clean re-examine", + lvs_raid, lvs_name, snode.get_id()) + try: + rpc_client.bdev_raid_delete(lvs_raid) + except Exception as e: + logger.warning( + "bdev_raid_delete(%s) raised: %s — proceeding to " + "_create_bdev_stack which is idempotent", lvs_raid, e) + stack = lvs_node.lvstore_stack if is_takeover else None + if is_takeover: + ret, err = _create_bdev_stack(snode, stack, primary_node=lvs_node) + else: + ret, err = _create_bdev_stack(snode, []) + if not ret: + logger.error( + "Failed to rebuild bdev stack on %s after raid drop: %s", + snode.get_id(), err) + # Fall through; bdev_examine below will surface what we have. + # Examine is required whenever the lvstore isn't surfaced — whether # the raid was freshly created by _create_bdev_stack (normal restart # path) or pre-existing with stale state (activation retry). 
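Reviewer note: the raid-drop recovery added in both restart paths above follows the same shape. A minimal sketch of that pattern (the names `rpc`, `rebuild_stack`, and `log` are stand-ins for rpc_client, the `_create_bdev_stack` call, and the module logger; the real code threads snode/primary_node/lvstore_stack specifics through):

    def reexamine_after_raid_drop(rpc, raid_name, lvstore_name, rebuild_stack, log):
        # SPDK rejects manual re-examine of an already-examined bdev
        # ("Duplicate bdev name for manual examine"), so examining a stale
        # raid is a silent no-op. Drop it first.
        try:
            rpc.bdev_raid_delete(raid_name)
        except Exception as e:
            # rebuild_stack (_create_bdev_stack) is idempotent, so a failed
            # delete only warrants a warning before rebuilding anyway.
            log.warning("bdev_raid_delete(%s) raised: %s", raid_name, e)
        ok, err = rebuild_stack()
        if not ok:
            log.error("failed to rebuild stack for %s: %s", lvstore_name, err)
        # Fall through: the caller's existing bdev_examine now targets a
        # freshly registered raid, so the lvstore module gets a real chance
        # to surface.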
The diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 69a43aa12..f43dfa01b 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1209,6 +1209,41 @@ def validate_sec_options(sec_options): return True, None +def alceml_fallback_overhead_bytes(cluster, device_size_bytes): + """Bytes of capacity charged as initial utilization when alceml runs in + cv_fallback_method on a device. Per 2 MiB extent the layout shrinks from + 510 to 504 data blocks (1 extended-md block + 5 filler), so we lose 6 + blocks per page. Returns 0 when inline_checksum is off, when device size + is unknown, or when the device runs in md-on-device mode (caller must + have already filtered for md_supported=False). + """ + if not getattr(cluster, 'inline_checksum', False): + return 0 + if not device_size_bytes or device_size_bytes <= 0: + return 0 + blk_size = cluster.blk_size or 4096 + page_size = cluster.page_size_in_blocks or (2 * 1024 * 1024) + pages = device_size_bytes // page_size + return int(pages * 6 * blk_size) + + +def alceml_checksum_params(cluster, nvme_device): + """Pick the inline-checksum method and tunables for bdev_alceml_create. + + Returns (method, cache_size, cache_eviction_threshold). method: + 0 = off (cluster.inline_checksum False) + 1 = md-on-device (cv_md_method, no read/write amplification) + 2 = fallback (cv_fallback_method, extra md page per 2 MiB extent) + cache_size and cache_eviction_threshold default to 0 so the data plane + keeps its built-in defaults (2000 entries, 90% eviction trigger). + """ + if not getattr(cluster, 'inline_checksum', False): + return 0, 0, 0 + if getattr(nvme_device, 'md_supported', False): + return 1, 0, 0 + return 2, 0, 0 + + def addNvmeDevices(rpc_client, snode, devs): devices = [] ret = rpc_client.bdev_nvme_controller_list() @@ -1271,6 +1306,12 @@ def addNvmeDevices(rpc_client, snode, devs): else: logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}") + # SPDK exposes per-namespace metadata size as a top-level uint32 in bdev_get_bdevs JSON + # (lib/bdev/bdev_rpc.c writes "md_size" via spdk_bdev_get_md_size). >=8 means alceml can run + # in cv_md_method on this device; 0 means it must run in cv_fallback_method. 
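# (Annotation, not part of the patch.) Shape of the bdev_get_bdevs entry the
# next lines consume; values are illustrative, only the top-level "md_size"
# field is read:
#   {"name": "nvme0n1", "block_size": 4096, "num_blocks": ...,
#    "md_size": 8, "driver_specific": {"nvme": [...]}}
# md_size >= 8 -> md_supported=True  (alceml can run in cv_md_method)
# md_size == 0 -> md_supported=False (alceml must use cv_fallback_method)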
+ md_size = int(nvme_dict.get('md_size', 0) or 0) + md_supported = md_size >= 8 + devices.append( NVMeDevice({ 'uuid': str(uuid.uuid4()), @@ -1284,7 +1325,9 @@ def addNvmeDevices(rpc_client, snode, devs): 'nvme_controller': nvme_controller, 'node_id': snode.get_id(), 'cluster_id': snode.cluster_id, - 'status': NVMeDevice.STATUS_ONLINE + 'status': NVMeDevice.STATUS_ONLINE, + 'md_size': md_size, + 'md_supported': md_supported, })) return devices @@ -1772,7 +1815,8 @@ def regenerate_config(new_config, old_config, force=False): def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None): + cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None, + inline_checksum=False): system_info = {} nodes_config: dict = {"nodes": []} @@ -1797,8 +1841,24 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a nvme_device_path = f"/dev/{nvme_device}n1" clean_partitions(nvme_device_path) nvme_json_string = get_idns(nvme_device_path) - lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) - format_nvme_device(nvme_device_path, lbaf_id) + lbaf_id = None + md_lbaf = False + if inline_checksum: + # Prefer an LBAF with metadata so alceml can run in cv_md_method on this drive. + lbaf_id = find_md_lbaf_id(nvme_json_string, target_ds=12, min_ms=8) + if lbaf_id is None: + logger.warning( + f"--enable-inline-checksum: device {nvme_device_path} exposes no 4K LBAF with >=8B metadata; " + f"formatting plain 4K. alceml will run in fallback mode on this drive." + ) + else: + md_lbaf = True + logger.info(f"Formatting {nvme_device_path} with md-capable LBAF index {lbaf_id}") + if lbaf_id is None: + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + # When switching to an md-capable LBAF, the namespace SectorSize stays 4096, + # so the in-list 4K early-out would skip the reformat. Force it. + format_nvme_device(nvme_device_path, lbaf_id, force_reformat=md_lbaf) for nid in sockets_to_use: if nid in cores_by_numa: @@ -2824,6 +2884,26 @@ def find_lbaf_id(json_data: str, target_ms: int, target_ds: int) -> int: return 0 +def find_md_lbaf_id(json_data: str, target_ds: int = 12, min_ms: int = 8): + """Return the LBAF index for a format with data-size==target_ds (log2, 12=4K) + and metadata-size>=min_ms. Among matches, prefer the smallest ms to avoid + wasting space on 64B-md formats. Returns None if no such LBAF exists. + """ + try: + data = json.loads(json_data) + except (json.JSONDecodeError, TypeError): + return None + candidates = [] + for index, lbaf in enumerate(data.get('lbafs', [])): + ms = lbaf.get('ms', 0) + if lbaf.get('ds') == target_ds and ms >= min_ms: + candidates.append((ms, index)) + if not candidates: + return None + candidates.sort() + return candidates[0][1] + + def get_idns(nvme_device: str): command = ['nvme', 'id-ns', nvme_device, '--output-format', 'json'] try: @@ -2895,8 +2975,11 @@ def is_namespace_4k_from_nvme_list(device_path: str) -> bool: return False -def format_nvme_device(nvme_device: str, lbaf_id: int): - if is_namespace_4k_from_nvme_list(nvme_device): +def format_nvme_device(nvme_device: str, lbaf_id: int, force_reformat: bool = False): + # The 4K early-out only checks SectorSize, not metadata size, so it would + # silently skip a reformat needed to switch a 4K-no-md namespace to 4K-with-md. + # Callers that need a specific LBAF (e.g. md-capable) pass force_reformat=True. 
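# (Annotation, not part of the patch.) Net behavior for a namespace that
# nvme-list already reports as 4K:
#   force_reformat=False -> the early-out below skips the format entirely
#   force_reformat=True  -> the format runs even though SectorSize is already
#                           4096, switching a 4K-no-md namespace to the
#                           md-capable LBAF
# The `nvme format <dev> --lbaf=<id> --force` invocation itself is unchanged;
# only the early-out is bypassed.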
+ if not force_reformat and is_namespace_4k_from_nvme_list(nvme_device): logger.debug(f"Device {nvme_device} already formatted with 4K...skipping") return command = ['nvme', 'format', nvme_device, f"--lbaf={lbaf_id}", '--force'] diff --git a/tests/test_activate_portblock_and_suspend_no_demote.py b/tests/test_activate_portblock_and_suspend_no_demote.py new file mode 100644 index 000000000..ebe56c7a2 --- /dev/null +++ b/tests/test_activate_portblock_and_suspend_no_demote.py @@ -0,0 +1,409 @@ +# coding=utf-8 +""" +Regression tests for two related fixes: + +1. cluster_activate (Pass 2): on re-activation (cluster.status != UNREADY), + the configured primary's LVS port is firewall-blocked before + recreate_lvstore_on_non_leader runs and unblocked afterwards. On a fresh + activation (cluster.status == UNREADY) NO port-block is issued — peers + aren't serving yet, so there is nothing to quiesce. + Bug: a JCERR-driven cluster suspend followed by re-activation looped + forever with bs_load_cur_extent_page_valid CRC mismatch on the + secondary's examine because the live primary kept writing into the LVS + blob metadata. Observed 2026-05-11, LVS_6769 on node 8084. + +2. suspend_storage_node: after blocking the lvs+hublvol ports, the + explicit bdev_lvol_set_leader(leader=False) and + bdev_distrib_force_to_non_leader RPCs are NOT issued. With both ports + blocked the surviving peer auto-promotes; demoting now races pre-block + in-flight IO still completing on the local distrib → writer conflict. +""" + +import unittest +from unittest.mock import MagicMock, patch + +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.iface import IFace +from simplyblock_core.models.storage_node import StorageNode + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _node(uuid, status=StorageNode.STATUS_ONLINE, lvstore="LVS_A", + jm_vuid=1, primary_secondary=None, primary_tertiary=None, + mgmt_ip="10.0.0.1", rpc_port=8080, n_devices=4): + from simplyblock_core.models.nvme_device import NVMeDevice + n = StorageNode() + n.uuid = uuid + n.cluster_id = "cluster-1" + n.status = status + n.hostname = f"host-{uuid}" + n.mgmt_ip = mgmt_ip + n.rpc_port = rpc_port + n.rpc_username = "u" + n.rpc_password = "p" + n.lvstore = lvstore + n.lvstore_status = "ready" + n.jm_vuid = jm_vuid + n.is_secondary_node = False + n.lvstore_stack_secondary = "" + n.lvstore_stack_tertiary = "" + n.lvstore_ports = {lvstore: {"lvol_subsys_port": 4420, "hublvol_port": 4427}} + devs = [] + for i in range(n_devices): + d = NVMeDevice() + d.uuid = f"dev-{uuid}-{i}" + d.status = NVMeDevice.STATUS_ONLINE + devs.append(d) + n.nvme_devices = devs + n.remote_devices = [] + n.remote_jm_devices = [] + n.physical_label = 0 + n.secondary_node_id = primary_secondary or "" + n.tertiary_node_id = primary_tertiary or "" + n.data_nics = [IFace()] + n.data_nics[0].ip4_address = mgmt_ip + n.data_nics[0].trtype = "TCP" + n.active_tcp = True + n.active_rdma = False + return n + + +def _cluster(status=Cluster.STATUS_SUSPENDED, ha_type="ha", ftt=1): + c = Cluster() + c.uuid = "cluster-1" + c.status = status + c.ha_type = ha_type + c.max_fault_tolerance = ftt + c.distr_ndcs = 4 + c.distr_npcs = 2 + c.distr_bs = 4096 + c.distr_chunk_bs = 4096 + c.page_size_in_blocks = 128 + c.nqn = "nqn.cluster" + c.is_single_node = False + c.enable_node_affinity = False + c.backup_config = None + return c + + +# 
=========================================================================== +# 1. cluster_activate Pass 2 port-block wrapper +# =========================================================================== + + +class TestActivatePortBlockWrapper(unittest.TestCase): + """The Pass 2 firewall block runs only on re-activation, not on first + activation, and wraps recreate_lvstore_on_non_leader exactly once per + (snode, primary_node) pair.""" + + def _patch_cluster_activate_environment( + self, cluster, primary, secondary, + recreate_lvstore_ret=True, + recreate_non_leader_ret=True, + recreate_non_leader_exc=None, + firewall_block_exc=None, + firewall_allow_exc=None, + ): + """Returns a tuple of (patches_started, recorded_calls). + + Caller is responsible for stopping the patches (via the returned + contextmanager-like list) — done in the test's tearDown via addCleanup. + """ + from simplyblock_core import cluster_ops + + db = MagicMock() + db.get_cluster_by_id.return_value = cluster + db.get_storage_nodes_by_cluster_id.return_value = [primary, secondary] + db.get_storage_node_by_id.side_effect = lambda nid: ( + primary if nid == primary.get_id() else secondary) + db.get_cluster_capacity.return_value = [{"size_total": 1 << 40}] + db.get_qos.return_value = [] + + def _primary_for(node_id): + return [primary] if node_id == secondary.get_id() else [] + db.get_primary_storage_nodes_by_secondary_node_id.side_effect = _primary_for + + # FirewallClient: record block/allow calls. + fw_calls = [] + + class FakeFW: + def __init__(self, node, timeout=3, retry=1): + self._node = node + + def firewall_set_port(self, port, ptype, action, rpc_port, **kw): + fw_calls.append((self._node.get_id(), port, action)) + if action == "block" and firewall_block_exc: + raise firewall_block_exc + if action == "allow" and firewall_allow_exc: + raise firewall_allow_exc + + # storage_node_ops.recreate_lvstore* are heavy — replace with stubs. + recreate_calls = [] + + def _recreate_primary(snode, activation_mode=False, **kw): + recreate_calls.append(("primary", snode.get_id(), activation_mode)) + return recreate_lvstore_ret + + def _recreate_non_leader(snode, leader, primary_node, + activation_mode=False, **kw): + recreate_calls.append(("non_leader", snode.get_id(), + primary_node.get_id(), activation_mode)) + if recreate_non_leader_exc: + raise recreate_non_leader_exc + return recreate_non_leader_ret + + # tasks_controller: drop schedule fallback (the wrapper falls back to + # add_port_allow_task only when unblock RPC fails — we don't trigger + # that in the green-path tests below; still need a no-op shim). 
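# (Annotation, not part of the patch.) Control flow these fakes are arranged
# to observe; a sketch inferred from the assertions below, not a copy of
# cluster_ops:
#
#   fw.firewall_set_port(lvs_port, ptype, "block", rpc_port)  # quiesce leader
#   try:
#       recreate_lvstore_on_non_leader(snode, leader, primary_node,
#                                      activation_mode=True)
#   finally:
#       try:
#           fw.firewall_set_port(lvs_port, ptype, "allow", rpc_port)
#       except Exception:
#           tasks_controller.add_port_allow_task(cluster_id, node_id, lvs_port)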
+ scheduled_port_allow = [] + + def _add_port_allow_task(cluster_id, node_id, port): + scheduled_port_allow.append((cluster_id, node_id, port)) + + port_events = [] + + def _port_deny(node, port): + port_events.append(("deny", node.get_id(), port)) + + def _port_allowed(node, port): + port_events.append(("allow", node.get_id(), port)) + + patches = [ + patch.object(cluster_ops, "db_controller", db), + patch.object(cluster_ops, "DBController", return_value=db), + patch.object(cluster_ops, "FirewallClient", FakeFW), + patch.object(cluster_ops.tcp_ports_events, "port_deny", _port_deny), + patch.object(cluster_ops.tcp_ports_events, "port_allowed", _port_allowed), + patch.object(cluster_ops.tasks_controller, "add_port_allow_task", + _add_port_allow_task), + patch.object(cluster_ops.storage_node_ops, "recreate_lvstore", + _recreate_primary), + patch.object(cluster_ops.storage_node_ops, "recreate_lvstore_on_non_leader", + _recreate_non_leader), + patch.object(cluster_ops.storage_node_ops, "get_next_physical_device_order", + lambda *a, **kw: 0), + patch.object(cluster_ops.storage_node_ops, "get_secondary_nodes", + lambda *a, **kw: [secondary.get_id()]), + patch.object(cluster_ops.storage_node_ops, "get_secondary_nodes_2", + lambda *a, **kw: []), + patch.object(cluster_ops, "set_cluster_status", lambda *a, **kw: None), + patch.object(cluster_ops, "time", MagicMock()), + # qos: prevent FDB writes + patch.object(cluster_ops.qos_controller, "get_qos_weights_list", + lambda *a, **kw: []), + ] + # Ensure each node's rpc_client and snode.recreate_hublvol / connect / + # write_to_db are no-ops (Pass 3 + post-loop work). The model objects + # already exist; we patch their methods inline. + for n in (primary, secondary): + n.write_to_db = MagicMock(return_value=True) + n.rpc_client = MagicMock() + n.recreate_hublvol = MagicMock(return_value=True) + n.create_secondary_hublvol = MagicMock(return_value=True) + n.connect_to_hublvol = MagicMock(return_value=True) + n.client = MagicMock() + # Primary's per-LVS port lookup — already populated in _node(). + # Make is_qos_set on the cluster return False so the QOS branch is skipped. + cluster.is_qos_set = lambda: False + + for p in patches: + p.start() + return patches, fw_calls, recreate_calls, port_events, scheduled_port_allow + + def _run_activate(self, cluster, primary, secondary, **kw): + from simplyblock_core import cluster_ops + patches, fw_calls, recreate_calls, port_events, scheduled = \ + self._patch_cluster_activate_environment(cluster, primary, + secondary, **kw) + self.addCleanup(lambda: [p.stop() for p in patches]) + try: + cluster_ops.cluster_activate("cluster-1", force=True) + except ValueError: + # cluster_activate may raise on the LVSRestartRequiredError path; + # the tests below assert on it explicitly. + pass + return fw_calls, recreate_calls, port_events, scheduled + + # ----- tests ----- + + def test_reactivation_blocks_and_unblocks_leader_port(self): + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, recreate_calls, port_events, _ = self._run_activate( + cluster, primary, secondary) + + # The wrapper must have issued block on primary:4420, then allow on + # primary:4420 — once each, in that order, and surrounding the + # non_leader recreate call. 
+ fw_for_primary = [c for c in fw_calls if c[0] == "primary-1"] + self.assertEqual( + fw_for_primary, + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")], + f"unexpected firewall sequence: {fw_calls}") + # Recreate on the non-leader ran with activation_mode=True (we deliberately + # do NOT switch the helper out of activation_mode — only add the firewall). + non_leader_runs = [c for c in recreate_calls if c[0] == "non_leader"] + self.assertEqual(len(non_leader_runs), 1, recreate_calls) + _, snode_id, primary_id, activation_mode = non_leader_runs[0] + self.assertEqual(snode_id, "secondary-1") + self.assertEqual(primary_id, "primary-1") + self.assertTrue(activation_mode, + "Pass 2 must still call helper with activation_mode=True " + "— the firewall wrapper provides the only added op") + # tcp_ports_events emitted deny + allowed events on the primary. + self.assertIn(("deny", "primary-1", 4420), port_events) + self.assertIn(("allow", "primary-1", 4420), port_events) + + def test_fresh_activation_does_not_block_leader_port(self): + cluster = _cluster(status=Cluster.STATUS_UNREADY) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, recreate_calls, port_events, _ = self._run_activate( + cluster, primary, secondary) + + # On fresh activation NO port-block is issued — peers aren't serving + # yet and the existing activation_mode=True short-circuit handles + # everything. + self.assertEqual( + [c for c in fw_calls if c[0] == "primary-1"], [], + f"fresh activation must not block primary's port; got {fw_calls}") + self.assertEqual( + [e for e in port_events if e[1] == "primary-1"], [], + f"fresh activation must not emit port deny/allow events; got {port_events}") + + def test_reactivation_unblocks_when_recreate_raises(self): + """LVSRestartRequiredError out of recreate_lvstore_on_non_leader must + not leak a stuck-blocked leader port — the finally clause unblocks.""" + from simplyblock_core import storage_node_ops + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + err = storage_node_ops.LVSRestartRequiredError( + "secondary-1", "LVS_A", detail="examine did not produce lvstore") + fw_calls, _, _, scheduled = self._run_activate( + cluster, primary, secondary, recreate_non_leader_exc=err) + + fw_for_primary = [c for c in fw_calls if c[0] == "primary-1"] + # block then allow — even on the exception path. + self.assertEqual( + fw_for_primary, + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")], + f"finally-unblock missing on exception path: {fw_calls}") + # No port_allow_task scheduled — the unblock RPC itself succeeded. + self.assertEqual(scheduled, []) + + def test_reactivation_schedules_port_allow_task_on_unblock_failure(self): + cluster = _cluster(status=Cluster.STATUS_SUSPENDED) + primary = _node("primary-1", primary_secondary="secondary-1") + secondary = _node("secondary-1", mgmt_ip="10.0.0.2", rpc_port=8081) + + fw_calls, _, _, scheduled = self._run_activate( + cluster, primary, secondary, + firewall_allow_exc=RuntimeError("network down")) + + # block recorded, allow attempted (and raised), so it is in fw_calls. + self.assertEqual( + [c for c in fw_calls if c[0] == "primary-1"], + [("primary-1", 4420, "block"), ("primary-1", 4420, "allow")]) + # Fallback task scheduled. 
+ self.assertEqual(scheduled, [("cluster-1", "primary-1", 4420)], + f"add_port_allow_task fallback missing: {scheduled}") + + +# =========================================================================== +# 2. suspend_storage_node — no leadership drop after port block +# =========================================================================== + + +class TestSuspendNoLeadershipDropAfterBlock(unittest.TestCase): + """suspend_storage_node must block lvs+hublvol ports but NOT issue + bdev_lvol_set_leader(leader=False) or bdev_distrib_force_to_non_leader + afterwards — the surviving peer auto-promotes on the closed redirect + and an explicit demote races pre-block in-flight IO.""" + + def _run(self, snode, secondary_owners=None): + from simplyblock_core import storage_node_ops + + db = MagicMock() + db.get_storage_node_by_id.return_value = snode + db.get_primary_storage_nodes_by_secondary_node_id.return_value = \ + secondary_owners or [] + + rpc_client = MagicMock() + snode.rpc_client = MagicMock(return_value=rpc_client) + snode.write_to_db = MagicMock(return_value=True) + + fw_calls = [] + + class FakeFW: + def __init__(self, node, timeout=20, retry=1): + self._node = node + + def firewall_set_port(self, port, ptype, action, rpc_port, **kw): + fw_calls.append((port, action)) + + patches = [ + patch.object(storage_node_ops, "DBController", return_value=db), + patch.object(storage_node_ops, "FirewallClient", FakeFW), + patch.object(storage_node_ops, "set_node_status", + lambda *a, **kw: None), + patch.object(storage_node_ops, "time", MagicMock()), + patch.object(storage_node_ops.tasks_controller, + "get_active_node_restart_task", + lambda *a, **kw: None), + patch.object(storage_node_ops.tasks_controller, + "get_active_node_tasks", lambda *a, **kw: []), + patch.object(storage_node_ops, "_check_ftt_allows_node_removal", + lambda *a, **kw: (True, "")), + ] + for p in patches: + p.start() + self.addCleanup(lambda: [p.stop() for p in patches]) + + ret = storage_node_ops.suspend_storage_node(snode.get_id()) + return ret, rpc_client, fw_calls + + def test_own_primary_lvs_blocks_ports_without_demote(self): + snode = _node("node-A", lvstore="LVS_A") + ret, rpc, fw_calls = self._run(snode) + self.assertTrue(ret) + # Both ports were blocked. + self.assertIn((4420, "block"), fw_calls) + self.assertIn((4427, "block"), fw_calls) + # And critically: no leadership drop after the block. + rpc.bdev_lvol_set_leader.assert_not_called() + rpc.bdev_distrib_force_to_non_leader.assert_not_called() + + def test_secondary_tertiary_lvs_blocks_ports_without_demote(self): + # snode also hosts secondary copy for primary "node-B" + snode = _node("node-A", lvstore="LVS_A") + snode.lvstore_stack_secondary = "node-B" + + primary_b = _node("node-B", lvstore="LVS_B", + mgmt_ip="10.0.0.99", rpc_port=8082) + primary_b.lvstore_ports = { + "LVS_B": {"lvol_subsys_port": 4430, "hublvol_port": 4431}} + + ret, rpc, fw_calls = self._run(snode, secondary_owners=[primary_b]) + self.assertTrue(ret) + # Sec lvs+hub ports blocked AND own primary lvs+hub ports blocked. + ports_blocked = {p for p, action in fw_calls if action == "block"} + self.assertIn(4430, ports_blocked) # sec lvs port (primary_b's lvs port) + self.assertIn(4431, ports_blocked) # sec hub port (primary_b's hub port) + self.assertIn(4420, ports_blocked) # own lvs port + self.assertIn(4427, ports_blocked) # own hub port + # No explicit demote on any LVS. 
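# (Annotation, not part of the patch.) Why the demote has to stay out:
# blocking the ports stops new client IO but does not halt requests already
# queued on the local distrib. An explicit demote lets those complete as
# non-leader while the peer auto-promotes, which is the writer-conflict race
# from the 2026-05-10 incident (jm_vuid=4245).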
+ rpc.bdev_lvol_set_leader.assert_not_called() + rpc.bdev_distrib_force_to_non_leader.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dhchap_e2e.py b/tests/test_dhchap_e2e.py index dbee35413..78415413b 100644 --- a/tests/test_dhchap_e2e.py +++ b/tests/test_dhchap_e2e.py @@ -441,7 +441,7 @@ def test_connect_lvol_includes_dhchap_secrets(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) + result, _err = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) self.assertTrue(len(result) > 0) cmd = result[0]["connect"] @@ -495,7 +495,7 @@ def test_connect_lvol_tls_only_with_psk(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) + result, _err = lvol_ctl.connect_lvol("lvol-1", host_nqn=host_nqn) cmd = result[0]["connect"] self.assertIn("--tls", cmd) @@ -546,7 +546,7 @@ def test_connect_lvol_without_host_nqn_is_rejected_when_acl_exists(self): with patch("simplyblock_core.controllers.lvol_controller.DBController", return_value=mock_db): - result = lvol_ctl.connect_lvol("lvol-1") + result, _err = lvol_ctl.connect_lvol("lvol-1") self.assertFalse(result) diff --git a/tests/test_dhchap_pool_level.py b/tests/test_dhchap_pool_level.py index 332184d2b..fbd33e13d 100644 --- a/tests/test_dhchap_pool_level.py +++ b/tests/test_dhchap_pool_level.py @@ -700,7 +700,7 @@ def test_host_with_dhchap_keys_injected_into_connect_cmd(self): } patcher, _ = _make_connect_ctx([host_entry]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() @@ -726,7 +726,7 @@ def test_host_with_psk_sets_tls_flag(self): } patcher, _ = _make_connect_ctx([host_entry]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() @@ -745,7 +745,7 @@ def test_missing_host_nqn_when_allowed_hosts_present_returns_false(self): patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn=None) + result, _err = connect_lvol("lvol-1", host_nqn=None) finally: patcher.stop() self.assertFalse(result) @@ -756,7 +756,7 @@ def test_unknown_host_nqn_returns_false(self): patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:intruder") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:intruder") finally: patcher.stop() self.assertFalse(result) @@ -768,7 +768,7 @@ def test_no_allowed_hosts_pass_through_with_host_nqn(self): patcher, _ = _make_connect_ctx([]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:whoever") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:whoever") finally: patcher.stop() @@ -791,7 +791,7 @@ def test_pool_level_dhchap_lvol_has_no_secret_in_connect_cmd(self): # Pool-level DHCHAP: lvol.allowed_hosts contains only nqn, no keys. 
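# (Annotation, not part of the patch.) Same mechanical change at every
# connect_lvol call site in these test diffs: the controller now returns a
# (result, err) two-tuple, so each test unpacks `result, _err` and keeps
# asserting on result alone.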
patcher, _ = _make_connect_ctx([{"nqn": "nqn:host-a"}]) try: - result = connect_lvol("lvol-1", host_nqn="nqn:host-a") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host-a") finally: patcher.stop() diff --git a/tests/test_dual_ft_secondary_fixes.py b/tests/test_dual_ft_secondary_fixes.py index 74fd05028..111df3ed8 100644 --- a/tests/test_dual_ft_secondary_fixes.py +++ b/tests/test_dual_ft_secondary_fixes.py @@ -149,7 +149,7 @@ def get_node(nid): db.get_storage_node_by_id.side_effect = get_node db.get_cluster_by_id.return_value = cluster - result = connect_lvol("vol-1") + result, _err = connect_lvol("vol-1") self.assertTrue(result) self.assertEqual(len(result), 3) @@ -174,7 +174,7 @@ def test_connect_single_node_uses_primary_port(self, mock_db_cls): db.get_storage_node_by_id.return_value = primary db.get_cluster_by_id.return_value = cluster - result = connect_lvol("vol-1") + result, _err = connect_lvol("vol-1") self.assertTrue(result) self.assertEqual(result[0]["port"], 4420) diff --git a/tests/test_inline_checksum.py b/tests/test_inline_checksum.py new file mode 100644 index 000000000..f83bc6644 --- /dev/null +++ b/tests/test_inline_checksum.py @@ -0,0 +1,308 @@ +# coding=utf-8 +""" +test_inline_checksum.py – unit tests for the per-cluster inline CRC checksum +validation feature (TD.100226.1). + +Covers: + * Cluster.inline_checksum / NVMeDevice.md_size / md_supported model defaults. + * find_md_lbaf_id helper – LBAF selection from `nvme id-ns` JSON. + * alceml_checksum_params helper – cluster flag + per-device md combo logic. + * alceml_fallback_overhead_bytes helper – capacity-overhead math. + * bdev_alceml_create RPC – correct param wire-up for each method. + * addNvmeDevices – md_size flowing from SPDK bdev JSON onto NVMeDevice. +""" + +import json +import unittest +from unittest.mock import patch, MagicMock + +from simplyblock_core import utils +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core.rpc_client import RPCClient + + +def _make_rpc_client(): + with patch("requests.session"): + return RPCClient("127.0.0.1", 8081, "user", "pass", timeout=1, retry=0) + + +# --------------------------------------------------------------------------- +# Model defaults & persistence +# --------------------------------------------------------------------------- +class TestModelDefaults(unittest.TestCase): + def test_cluster_inline_checksum_defaults_off(self): + c = Cluster() + self.assertFalse(c.inline_checksum) + + def test_cluster_inline_checksum_can_be_set(self): + c = Cluster() + c.inline_checksum = True + self.assertTrue(c.inline_checksum) + + def test_nvme_device_md_fields_default_off(self): + d = NVMeDevice() + self.assertEqual(d.md_size, 0) + self.assertFalse(d.md_supported) + + def test_nvme_device_md_fields_round_trip(self): + d = NVMeDevice({'md_size': 16, 'md_supported': True}) + self.assertEqual(d.md_size, 16) + self.assertTrue(d.md_supported) + + +# --------------------------------------------------------------------------- +# find_md_lbaf_id +# --------------------------------------------------------------------------- +class TestFindMdLbafId(unittest.TestCase): + def _idns(self, lbafs): + return json.dumps({'lbafs': lbafs}) + + def test_returns_none_when_no_md_lbaf(self): + # Only 4K-no-md available. 
+ s = self._idns([ + {'ms': 0, 'ds': 9}, + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_picks_smallest_ms_above_min(self): + # 4K-with-8B is preferred over 4K-with-64B (waste less space). + s = self._idns([ + {'ms': 0, 'ds': 12}, # idx 0 – no md + {'ms': 64, 'ds': 12}, # idx 1 + {'ms': 8, 'ds': 12}, # idx 2 – preferred + {'ms': 16, 'ds': 12}, # idx 3 + ]) + self.assertEqual(utils.find_md_lbaf_id(s), 2) + + def test_skips_non_matching_ds(self): + # An LBAF with ms>=8 but ds!=12 must not be selected. + s = self._idns([ + {'ms': 8, 'ds': 9}, # 512B with md – ignore + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_below_min_ms_excluded(self): + s = self._idns([ + {'ms': 4, 'ds': 12}, # below 8B threshold + {'ms': 0, 'ds': 12}, + ]) + self.assertIsNone(utils.find_md_lbaf_id(s)) + + def test_invalid_json_returns_none(self): + self.assertIsNone(utils.find_md_lbaf_id("not json")) + self.assertIsNone(utils.find_md_lbaf_id(None)) + + def test_empty_lbafs_returns_none(self): + self.assertIsNone(utils.find_md_lbaf_id(self._idns([]))) + + +# --------------------------------------------------------------------------- +# alceml_checksum_params +# --------------------------------------------------------------------------- +class TestAlcemlChecksumParams(unittest.TestCase): + def test_off_when_cluster_flag_off(self): + c = Cluster({'inline_checksum': False}) + d = NVMeDevice({'md_supported': True}) + self.assertEqual(utils.alceml_checksum_params(c, d), (0, 0, 0)) + + def test_method_1_when_md_supported(self): + c = Cluster({'inline_checksum': True}) + d = NVMeDevice({'md_supported': True, 'md_size': 8}) + self.assertEqual(utils.alceml_checksum_params(c, d), (1, 0, 0)) + + def test_method_2_when_md_unsupported(self): + c = Cluster({'inline_checksum': True}) + d = NVMeDevice({'md_supported': False, 'md_size': 0}) + self.assertEqual(utils.alceml_checksum_params(c, d), (2, 0, 0)) + + def test_off_for_cluster_without_attribute(self): + # Old DB record (no inline_checksum field) must behave as off. + class _Old: + pass + d = NVMeDevice({'md_supported': True}) + self.assertEqual(utils.alceml_checksum_params(_Old(), d), (0, 0, 0)) + + +# --------------------------------------------------------------------------- +# alceml_fallback_overhead_bytes +# --------------------------------------------------------------------------- +class TestFallbackOverhead(unittest.TestCase): + def test_zero_when_flag_off(self): + c = Cluster({'inline_checksum': False, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, 100 * 2 * 1024 * 1024), 0) + + def test_zero_for_zero_or_negative_size(self): + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, 0), 0) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, -1), 0) + + def test_six_blocks_per_page(self): + # 100 pages × 2 MiB = 200 MiB device. Overhead = 100 × 6 × 4 KiB = 2400 KiB. + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + device_size = 100 * 2 * 1024 * 1024 + expected = 100 * 6 * 4096 + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, device_size), expected) + + def test_partial_page_floored(self): + # 1.5 pages → only 1 full page counts (page-granular accounting). 
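# (Annotation, not part of the patch.) Worked numbers behind the
# 6-blocks-per-page charge: a 2 MiB page holds 512 raw 4 KiB blocks; in
# fallback mode the layout's 510 data blocks shrink to 504 (one extended-md
# block plus five filler), losing 6 blocks (24 KiB) per page. Ratio
# 6/512 = 0.01171875 (~1.17%); a 1 TiB device (2**19 pages) is charged
# exactly 2**19 * 6 * 4096 bytes = 12 GiB.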
+ c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + partial = (1 * 2 * 1024 * 1024) + (1 * 1024 * 1024) + self.assertEqual(utils.alceml_fallback_overhead_bytes(c, partial), 1 * 6 * 4096) + + def test_overhead_is_about_1_17_percent(self): + # Sanity: the design doc cites ~1.17% overhead in fallback mode. + c = Cluster({'inline_checksum': True, 'blk_size': 4096, 'page_size_in_blocks': 2 * 1024 * 1024}) + size = 1024 * 2 * 1024 * 1024 # 2 GiB, 1024 pages + ratio = utils.alceml_fallback_overhead_bytes(c, size) / size + self.assertAlmostEqual(ratio, 6 / 512, places=6) + + +# --------------------------------------------------------------------------- +# bdev_alceml_create RPC params +# --------------------------------------------------------------------------- +class TestBdevAlcemlCreateRPC(unittest.TestCase): + @patch.object(RPCClient, "_request") + def test_no_checksum_params_when_method_zero(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create("alc_x", "nvme0", "uuid-1") + params = mock_req.call_args[0][1] + self.assertNotIn("checksum_validation_method", params) + self.assertNotIn("cache_size", params) + self.assertNotIn("cache_eviction_threshold", params) + + @patch.object(RPCClient, "_request") + def test_method_1_only_emits_method_field(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create("alc_x", "nvme0", "uuid-1", checksum_method=1) + params = mock_req.call_args[0][1] + self.assertEqual(params["checksum_validation_method"], 1) + # Defaults of 0 must not be sent so the data plane uses its own defaults. + self.assertNotIn("cache_size", params) + self.assertNotIn("cache_eviction_threshold", params) + + @patch.object(RPCClient, "_request") + def test_method_2_with_explicit_cache_overrides(self, mock_req): + mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create( + "alc_x", "nvme0", "uuid-1", + checksum_method=2, cache_size=1500, cache_eviction_threshold=85, + ) + params = mock_req.call_args[0][1] + self.assertEqual(params["checksum_validation_method"], 2) + self.assertEqual(params["cache_size"], 1500) + self.assertEqual(params["cache_eviction_threshold"], 85) + + @patch.object(RPCClient, "_request") + def test_existing_params_unchanged(self, mock_req): + # Regression guard: the new kwargs must not perturb the well-known params + # the data plane expects. 
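# (Annotation, not part of the patch.) Param-emission rule this class pins
# down, inferred from the assertions and assuming bdev_alceml_create builds
# its params dict conditionally:
#   checksum_method == 0          -> none of the three new fields is sent
#   checksum_method in (1, 2)     -> "checksum_validation_method" is sent
#   cache_size / threshold == 0   -> omitted; the data plane keeps its
#                                    built-in defaults (2000 entries / 90%)
#   nonzero cache overrides       -> sent verbatim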
+ mock_req.return_value = True + client = _make_rpc_client() + client.bdev_alceml_create( + "alc_x", "nvme0", "uuid-1", + pba_init_mode=2, pba_page_size=2 * 1024 * 1024, + write_protection=True, full_page_unmap=True, checksum_method=1, + ) + params = mock_req.call_args[0][1] + self.assertEqual(params["name"], "alc_x") + self.assertEqual(params["cntr_path"], "nvme0") + self.assertEqual(params["uuid"], "uuid-1") + self.assertEqual(params["pba_init_mode"], 2) + self.assertEqual(params["pba_page_size"], 2 * 1024 * 1024) + self.assertTrue(params["write_protection"]) + self.assertTrue(params["use_map_whole_page_on_1st_write"]) + self.assertEqual(params["checksum_validation_method"], 1) + + +# --------------------------------------------------------------------------- +# addNvmeDevices md detection +# --------------------------------------------------------------------------- +class TestAddNvmeDevicesMd(unittest.TestCase): + def _make_rpc_with_bdev(self, *, md_size): + rpc = MagicMock() + rpc.bdev_nvme_controller_list.return_value = [] + rpc.bdev_nvme_controller_attach.return_value = ["nvmeX_n1"] + rpc.bdev_examine.return_value = True + rpc.bdev_wait_for_examine.return_value = True + # SPDK bdev_get_bdevs payload – the only md-relevant field is the + # top-level uint32 md_size set by spdk_bdev_get_md_size. + bdev_payload = [{ + 'name': 'nvmeX_n1', + 'block_size': 4096, + 'num_blocks': 100 * 1024 * 1024 // 4096, # 100 MiB + 'md_size': md_size, + 'driver_specific': { + 'nvme': [{ + 'pci_address': '0000:00:01.0', + 'ctrlr_data': { + 'model_number': 'TEST_NVME', + 'serial_number': 'SN-TEST-1', + }, + }], + }, + }] + rpc.get_bdevs.return_value = bdev_payload + return rpc + + def _make_snode(self): + snode = MagicMock() + snode.physical_label = 0 + snode.id_device_by_nqn = False + snode.get_id.return_value = "snode-1" + snode.cluster_id = "cluster-1" + return snode + + def test_md_size_zero_marks_unsupported(self): + rpc = self._make_rpc_with_bdev(md_size=0) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(len(devs), 1) + self.assertEqual(devs[0].md_size, 0) + self.assertFalse(devs[0].md_supported) + + def test_md_size_8_marks_supported(self): + rpc = self._make_rpc_with_bdev(md_size=8) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 8) + self.assertTrue(devs[0].md_supported) + + def test_md_size_below_threshold_marks_unsupported(self): + # Pre-existing PI-only formats expose ms=4 (T10 PI's reftag field + # alone). That's < 8 bytes so checksums won't fit – treat as no-md. + rpc = self._make_rpc_with_bdev(md_size=4) + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 4) + self.assertFalse(devs[0].md_supported) + + def test_missing_md_size_field_treated_as_zero(self): + # Older SPDK builds that omit the field entirely must not crash. 
+ rpc = MagicMock() + rpc.bdev_nvme_controller_list.return_value = [] + rpc.bdev_nvme_controller_attach.return_value = ["nvmeX_n1"] + rpc.get_bdevs.return_value = [{ + 'name': 'nvmeX_n1', + 'block_size': 4096, + 'num_blocks': 100 * 1024 * 1024 // 4096, + 'driver_specific': { + 'nvme': [{ + 'pci_address': '0000:00:01.0', + 'ctrlr_data': {'model_number': 'M', 'serial_number': 'S'}, + }], + }, + }] + snode = self._make_snode() + devs = utils.addNvmeDevices(rpc, snode, ["0000:00:01.0"]) + self.assertEqual(devs[0].md_size, 0) + self.assertFalse(devs[0].md_supported) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_nvmeof_security.py b/tests/test_nvmeof_security.py index 644615854..44aae70ae 100644 --- a/tests/test_nvmeof_security.py +++ b/tests/test_nvmeof_security.py @@ -671,7 +671,7 @@ def test_connect_with_psk_includes_tls_flag(self, MockDBCtrl): mock_db.get_cluster_by_id.return_value = cl MockDBCtrl.return_value = mock_db - result = connect_lvol("lvol-1", host_nqn="nqn:host1") + result, _err = connect_lvol("lvol-1", host_nqn="nqn:host1") self.assertTrue(len(result) > 0) entry = result[0] self.assertIn("--tls", entry["connect"]) @@ -696,7 +696,7 @@ def test_connect_without_tls_no_flag(self, MockDBCtrl): mock_db.get_cluster_by_id.return_value = cl MockDBCtrl.return_value = mock_db - result = connect_lvol("lvol-1") + result, _err = connect_lvol("lvol-1") self.assertTrue(len(result) > 0) entry = result[0] self.assertNotIn("tls", entry) diff --git a/tests/test_peer_disconnect.py b/tests/test_peer_disconnect.py index 816d86ee6..31ccecce9 100644 --- a/tests/test_peer_disconnect.py +++ b/tests/test_peer_disconnect.py @@ -41,12 +41,20 @@ class TestCheckPeerDisconnected(unittest.TestCase): def _run(self, status, quorum_result=False): # _check_peer_disconnected imports ``is_node_data_plane_disconnected_quorum`` # locally from the services module at call time, so the patch must target - # the source module (where the symbol lives). + # the source module (where the symbol lives). The function also re-fetches + # the peer from FDB before reading status (cc3b4811), so DBController must + # be patched to return the same mock — otherwise the lookup hits + # uninitialised FDB and the KeyError fallback returns True for every input, + # masking the branch under test. from simplyblock_core import storage_node_ops as mod peer = _node(status=status) + db = MagicMock() + db.get_storage_node_by_id.return_value = peer with patch( "simplyblock_core.services.storage_node_monitor.is_node_data_plane_disconnected_quorum", - return_value=quorum_result) as q: + return_value=quorum_result) as q, \ + patch("simplyblock_core.storage_node_ops.DBController", + return_value=db): return mod._check_peer_disconnected(peer), q # -----------------------------------------------------------------
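Reviewer note on the DBController patch above: a minimal sketch of the call shape it guards against, assumed from the comment (cc3b4811 made _check_peer_disconnected re-fetch the peer from FDB before trusting its status; names other than DBController/get_storage_node_by_id are stand-ins):

    def _check_peer_disconnected_shape(peer, db_controller_cls):
        # The test patch routes this constructor to the shared mock; without
        # it the lookup hits uninitialised FDB, raises KeyError, and the
        # fallback returns True for every status, masking the branch under
        # test.
        db = db_controller_cls()
        fresh = db.get_storage_node_by_id(peer.get_id())
        return fresh.status  # status is read from the re-fetched record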