From fd465a071849ff7e293d47ccc5f791d1cb53bdf4 Mon Sep 17 00:00:00 2001 From: grumbach Date: Wed, 20 May 2026 16:19:36 +0900 Subject: [PATCH 01/23] fix(upgrade): re-verify ML-DSA signature on every cache hit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shared upgrade binary cache stored the extracted binary and, on a cache hit, returned it after only a SHA-256 check against a sibling .meta.json. SHA-256 is not a security control: anyone able to write to the shared cache directory (a co-located process, a shared container volume, a low-privilege foothold on the host) could drop a malicious binary plus a forged matching metadata hash, and the next ant-node instance to upgrade would execute it with no signature verification at all — persistent RCE on every co-located node. The ML-DSA-65 signature covers the archive and was only checked on the initial download, never on a cache hit. Changes: - Cache the signed *archive + detached signature* instead of the extracted binary. `BinaryCache::get_verified_archive` re-runs ML-DSA-65 verification on every cache hit; the binary is always extracted fresh from the just-verified archive. A tampered archive, tampered or missing signature, or forged metadata fails verification against the pinned release public key, so a poisoned cache entry is rejected and a fresh verified download runs. - Stage cached files into the caller's process-private temp directory and verify that copy, then extract from the same private path. Closes the verify-vs-extract TOCTOU on the shared cache files: an attacker cannot swap the bytes between when the verifier reads them and when the extractor reads them. - Size policy before any copy or read. `fs::symlink_metadata` + `file_type().is_file()` rejects symlinks / FIFOs / devices outright; archive size is bounded by `MAX_ARCHIVE_SIZE_BYTES` and the signature must be exactly `SIGNATURE_SIZE` bytes. 
Otherwise an attacker could plant `cached.archive -> /dev/zero` (stats as 0 bytes) and force unbounded disk fill in the staging dir or OOM in `signature::verify`. - Cache only after successful extraction. A validly-signed-but-malformed release no longer becomes a shared cache poison pill that every later node hits in the cache, fails to extract, and re-downloads. - `cache_dir.rs` restricts the shared upgrade cache directory to 0700 on Unix as defence in depth; the ML-DSA gate is the primary control. - `store_archive` mirrors the same size / file-type / signature checks before persisting, so a poisoned entry cannot be created through the supported path either. Tests in `src/upgrade/binary_cache.rs` cover the tamper path (SHA-256-forged swap on disk rejected by the signature re-check), the post-hit shared-file swap (private copy unaffected), the symlink-to-`/dev/zero` bypass attempt, oversize archive / wrong-sized signature rejection, and round-trip storage. Production verifies against the pinned `RELEASE_SIGNING_KEY`; tests use a `#[cfg(test)]`-only constructor that injects a generated key without weakening the production trust anchor. Residual: cache entries are not bound to a specific release version (the ML-DSA signing context is constant across versions), so a same-UID attacker who already has any past validly-signed release can plant it under a newer version's cache key and force a downgrade to that old signed binary. Not RCE (still legitimately-signed bytes) and a same-UID attacker has easier paths anyway; closing it cleanly requires coordinated changes in the release-signing pipeline, ant-keygen, ant-node, and ant-client, and is tracked in the binary_cache module docs. 
--- src/upgrade/apply.rs | 93 +++-- src/upgrade/binary_cache.rs | 678 +++++++++++++++++++++++++++++++----- src/upgrade/cache_dir.rs | 21 ++ 3 files changed, 668 insertions(+), 124 deletions(-) diff --git a/src/upgrade/apply.rs b/src/upgrade/apply.rs index 9d19870a..7af8c7d7 100644 --- a/src/upgrade/apply.rs +++ b/src/upgrade/apply.rs @@ -20,7 +20,7 @@ use std::path::{Path, PathBuf}; use tar::Archive; /// Maximum allowed upgrade archive size (200 MiB). -const MAX_ARCHIVE_SIZE_BYTES: usize = 200 * 1024 * 1024; +pub(super) const MAX_ARCHIVE_SIZE_BYTES: usize = 200 * 1024 * 1024; /// Exit code that signals the service manager to restart the process. /// @@ -176,9 +176,24 @@ impl AutoApplyUpgrader { .parent() .ok_or_else(|| Error::Upgrade("Cannot determine binary directory".to_string()))?; - // Create temp directory for upgrade - let temp_dir = tempfile::Builder::new() - .prefix("ant-upgrade-") + // Create temp directory for upgrade. + // + // On Unix, create it with 0700 so a same-host attacker on a different + // UID cannot read/write the staging area between when the cache + // re-verifies the ML-DSA signature on a private copy and when + // `extract_binary` reads it (closes a verify-vs-extract TOCTOU on + // the staging directory). The `tempfile::Builder::permissions` + // path is supported on tempfile 3 — on platforms that don't honour + // it the call is a no-op and the ML-DSA verification on the + // private copy still bounds the residual. 
+ let mut tempdir_builder = tempfile::Builder::new(); + tempdir_builder.prefix("ant-upgrade-"); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + tempdir_builder.permissions(std::fs::Permissions::from_mode(0o700)); + } + let temp_dir = tempdir_builder .tempdir_in(binary_dir) .map_err(|e| Error::Upgrade(format!("Failed to create temp dir: {e}")))?; @@ -317,21 +332,26 @@ impl AutoApplyUpgrader { version_str: &str, ) -> Result { if let Some(ref cache) = self.binary_cache { - // Fast path — cache hit without locking - if let Some(cached_path) = cache.get_verified(version_str) { - info!("Cached binary verified for version {}", version_str); - let dest = dest_dir.join( - cached_path - .file_name() - .unwrap_or_else(|| std::ffi::OsStr::new("ant-node")), - ); - if let Err(e) = fs::copy(&cached_path, &dest) { - warn!("Failed to copy from cache, will re-download: {e}"); - return self - .download_verify_extract(info, dest_dir, Some(cache)) - .await; + // Fast path — cache hit without locking. The cache re-verifies + // the ML-DSA signature over the archive on every call, so a + // tampered cache entry returns None here and we fall through to + // a fresh, fully verified download. + // `dest_dir` is this upgrade's process-private temp dir, so the + // cache stages + verifies the archive there; extraction then + // reads exactly the verified bytes (no shared-file TOCTOU). 
+ if let Some(verified_archive) = cache.get_verified_archive(version_str, dest_dir) { + match Self::extract_binary(&verified_archive, dest_dir) { + Ok(binary) => { + info!("Reused signature-verified cached archive for {version_str}"); + return Ok(binary); + } + Err(e) => { + warn!("Failed to extract from cached archive, will re-download: {e}"); + return self + .download_verify_extract(info, dest_dir, Some(cache)) + .await; + } } - return Ok(dest); } // Cache miss — acquire exclusive download lock via spawn_blocking @@ -345,19 +365,15 @@ impl AutoApplyUpgrader { .await .map_err(|e| Error::Upgrade(format!("Lock task failed: {e}")))??; - // Re-check cache under the lock — another node may have populated it - if let Some(cached_path) = cache.get_verified(version_str) { - info!( - "Cached binary became available under lock for version {}", - version_str - ); - let dest = dest_dir.join( - cached_path - .file_name() - .unwrap_or_else(|| std::ffi::OsStr::new("ant-node")), - ); - fs::copy(&cached_path, &dest)?; - return Ok(dest); + // Re-check cache under the lock — another node may have populated + // it. Same re-verification guarantee as the fast path. + if let Some(verified_archive) = cache.get_verified_archive(version_str, dest_dir) { + if let Ok(binary) = Self::extract_binary(&verified_archive, dest_dir) { + info!( + "Signature-verified cached archive became available under lock for {version_str}" + ); + return Ok(binary); + } } // Still missing — download while holding the lock @@ -400,15 +416,22 @@ impl AutoApplyUpgrader { signature::verify_from_file(&archive_path, &sig_path)?; info!("Archive signature verified successfully"); - // Step 4: Extract binary from verified archive + // Step 4: Extract binary from the just-verified archive. 
info!("Extracting binary from archive..."); let extracted_binary = Self::extract_binary(&archive_path, dest_dir)?; - // Store in binary cache if available + // Step 5: Cache the signature-verified ARCHIVE (+ its signature) + // AFTER successful extraction. We cache the signed artifact, never + // the extracted binary, so every later cache hit can re-verify the + // signature. Caching only after extract proves the archive is + // actually usable on this platform avoids turning a + // validly-signed-but-malformed release into a shared cache poison + // pill (every later node would hit cache, fail extract, and + // re-download). if let Some(c) = cache { let version_str = info.version.to_string(); - if let Err(e) = c.store(&version_str, &extracted_binary) { - warn!("Failed to store binary in cache: {e}"); + if let Err(e) = c.store_archive(&version_str, &archive_path, &sig_path) { + warn!("Failed to store verified archive in cache: {e}"); } } diff --git a/src/upgrade/binary_cache.rs b/src/upgrade/binary_cache.rs index 43aeb2ff..b708a1c5 100644 --- a/src/upgrade/binary_cache.rs +++ b/src/upgrade/binary_cache.rs @@ -1,124 +1,400 @@ -//! Disk cache for downloaded upgrade binaries. +//! Disk cache for downloaded upgrade archives. //! //! When multiple ant-node instances detect the same upgrade, only the first -//! one needs to download and verify the archive. `BinaryCache` stores the -//! extracted binary alongside a SHA-256 integrity metadata file so that -//! subsequent nodes can copy it directly. +//! one needs to download the archive. `BinaryCache` stores the **signed +//! archive together with its detached ML-DSA-65 signature** so that +//! subsequent nodes can reuse it. //! -//! **Security note:** SHA-256 is used only for cache integrity (detecting -//! corruption or partial writes). The actual security gate remains the -//! ML-DSA-65 signature verification performed during the initial download. +//! ## Security model +//! +//! 
The ML-DSA-65 signature is the security gate, and it covers the *archive* +//! bytes — not the extracted binary. A previous version cached the extracted +//! binary and, on a cache hit, returned it after only a SHA-256 check against +//! a sibling metadata file. SHA-256 is not a security control: anyone able to +//! write to the shared cache directory (a co-located process, a shared +//! container volume, a low-privilege foothold) could replace the cached +//! binary and its `.meta.json` with a matching hash, and the next node would +//! execute it **without any signature verification** — persistent RCE. +//! +//! This module now caches the *archive + signature* and, on **every** cache +//! hit, re-runs ML-DSA-65 verification over the cached archive before it is +//! used. A tampered archive fails verification (the release key is pinned in +//! the binary and cannot be forged); a tampered or missing signature fails +//! likewise. The extracted binary is always derived fresh from the +//! just-verified archive by the caller, so a poisoned cache entry can never +//! be executed. The SHA-256 metadata is retained only as a fast corruption +//! pre-check, never as the trust decision. +//! +//! ## Residual: cache entries are not bound to a specific release version +//! +//! `signature::SIGNING_CONTEXT = "ant-node-release-v1"` is constant across +//! versions, so the ML-DSA signature attests to "this archive is a valid +//! ant-node release", not "this archive is release X.Y.Z". An attacker with +//! cache-dir write access who possesses any past validly-signed release can +//! plant it under a newer version's cache key; the next node performing +//! that upgrade accepts it and runs it as the newer version. Net effect: +//! forced downgrade or wrong-arch crash loop, not arbitrary RCE. +//! +//! This is out of scope of the cache-poisoning RCE class this module +//! addresses (which trusted SHA-256 alone on cache hits): the `cache_dir` +//! 
is `0o700` (defence in depth, see `cache_dir.rs`) and the attacker +//! already needs same-UID write to exploit this — they can replace the +//! running binary directly. Closing the gap properly requires upstream +//! release-signing changes (the signing context must include the version +//! string, e.g. `b"ant-node-release-v1:1.2.3"`) and is tracked as a +//! follow-up. use crate::error::{Error, Result}; use crate::logging::{debug, warn}; +use crate::upgrade::signature; use fs2::FileExt; +use saorsa_pqc::api::sig::MlDsaPublicKey; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::fs::{self, File}; use std::io::{Read, Write}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; -/// On-disk cache for downloaded upgrade binaries. +/// On-disk cache for downloaded, signature-verified upgrade archives. #[derive(Clone)] pub struct BinaryCache { - /// Directory that holds cached binaries and metadata. + /// Directory that holds cached archives, signatures, and metadata. cache_dir: PathBuf, + /// Verification key override. `None` in production → the pinned release + /// key embedded in [`signature`] is used (the real, unforgeable gate). + /// Only ever `Some` via the `#[cfg(test)]` constructor, so test builds + /// can exercise the cache with a generated keypair without weakening the + /// production trust anchor in any way. + verify_key: Option, } -/// Metadata written alongside each cached binary. +/// Metadata written alongside each cached archive. +/// +/// The SHA-256 here is a fast integrity/corruption pre-check only. It is +/// **not** a security control: the ML-DSA-65 signature over the archive is +/// re-verified on every cache hit regardless of this value. #[derive(Serialize, Deserialize)] -struct CachedBinaryMeta { +struct CachedArchiveMeta { /// Semantic version string (e.g. "1.2.3"). version: String, - /// Hex-encoded SHA-256 digest of the cached binary. 
- sha256: String, - /// When the binary was cached (seconds since UNIX epoch). + /// Hex-encoded SHA-256 digest of the cached archive (corruption check). + archive_sha256: String, + /// When the archive was cached (seconds since UNIX epoch). cached_at_epoch_secs: u64, } impl BinaryCache { /// Create a new binary cache backed by the given directory. + /// + /// Production constructor: the cache verifies cached archives against the + /// pinned release public key embedded in the binary. #[must_use] pub fn new(cache_dir: PathBuf) -> Self { - Self { cache_dir } + Self { + cache_dir, + verify_key: None, + } } - /// Return the path where a cached binary for `version` would be stored. + /// Test-only constructor that verifies against an explicit public key + /// instead of the pinned release key (the production trust anchor is + /// unchanged; this only exists so unit tests can produce verifiable + /// signatures with a generated keypair). + #[cfg(test)] #[must_use] - pub fn cached_binary_path(&self, version: &str) -> PathBuf { - let name = if cfg!(windows) { - format!("ant-node-{version}.exe") - } else { - format!("ant-node-{version}") - }; - self.cache_dir.join(name) + pub fn new_with_verify_key(cache_dir: PathBuf, verify_key: MlDsaPublicKey) -> Self { + Self { + cache_dir, + verify_key: Some(verify_key), + } } - /// Return the cached binary path if it exists and its SHA-256 matches - /// the stored metadata. Returns `None` on any mismatch or error. + /// Path of the cached archive for `version`. #[must_use] - pub fn get_verified(&self, version: &str) -> Option { - let bin_path = self.cached_binary_path(version); + pub fn cached_archive_path(&self, version: &str) -> PathBuf { + self.cache_dir.join(format!("ant-node-{version}.archive")) + } + + /// Path of the cached detached signature for `version`. 
+ #[must_use] + fn cached_signature_path(&self, version: &str) -> PathBuf { + self.cache_dir.join(format!("ant-node-{version}.sig")) + } + + /// Verify `archive` against `sig` using the pinned release key in + /// production, or the injected test key under `#[cfg(test)]`. + fn verify_archive(&self, archive: &Path, sig: &Path) -> Result<()> { + self.verify_key.as_ref().map_or_else( + || signature::verify_from_file(archive, sig), + |key| signature::verify_from_file_with_key(archive, sig, key), + ) + } + + /// Copy the cached archive into the caller-private `private_dir`, + /// **cryptographically re-verify that private copy**, and return its + /// path — or `None` if there is no usable, trusted cache entry. + /// + /// On every call this: + /// 1. loads the sibling metadata and checks the version matches, + /// 2. copies the cached archive + signature into `private_dir` (a + /// location only this process writes, e.g. the per-upgrade temp dir), + /// 3. SHA-256 pre-checks the private copy against the metadata (fast + /// corruption check), then + /// 4. **re-verifies the ML-DSA-65 signature over the private copy** with + /// the pinned release key — the actual security gate. + /// + /// Verifying the *private copy* (not the shared cache file) closes the + /// TOCTOU window: an attacker with write access to the shared cache dir + /// cannot swap the bytes between verification and extraction, because the + /// caller extracts from the returned private path, which is the exact + /// byte sequence that was verified and is unreachable to the attacker. + /// + /// Any failure (missing/corrupt metadata, copy error, hash mismatch, + /// missing signature, or — critically — a signature that does not verify + /// against the pinned release key) returns `None`, forcing a fresh, + /// fully verified download. 
+ /// + /// The caller MUST extract the binary from the returned (private) archive + /// path, so the executed bytes always derive from signature-verified + /// input that no other principal could have modified post-verification. + #[must_use] + pub fn get_verified_archive(&self, version: &str, private_dir: &Path) -> Option { + let cached_archive = self.cached_archive_path(version); + let cached_sig = self.cached_signature_path(version); let meta_path = self.meta_path(version); let meta_data = fs::read_to_string(&meta_path).ok()?; - let meta: CachedBinaryMeta = serde_json::from_str(&meta_data).ok()?; + let meta: CachedArchiveMeta = serde_json::from_str(&meta_data).ok()?; if meta.version != version { debug!("Binary cache version mismatch in metadata"); return None; } - let actual_hash = sha256_file(&bin_path).ok()?; - if actual_hash != meta.sha256 { + // Size policy gate — runs BEFORE we copy or read the cached files. + // + // `signature::verify_from_file*` and `sha256_file` both load the + // archive into memory in full. An attacker with cache-dir write + // access could otherwise drop a multi-GB `.archive` and force disk + // exhaustion in the staging dir or an OOM during re-verification + // before the entry is rejected. The download path already enforces + // `MAX_ARCHIVE_SIZE_BYTES`; cache hits must honour the same bound, + // plus the fixed `SIGNATURE_SIZE`. + // Use `symlink_metadata` (does NOT follow symlinks) and require a + // regular file. Otherwise a cache-dir writer could plant + // `ant-node-X.archive -> /dev/zero` (or a FIFO/device) whose `.len()` + // stats as 0 — passing a `fs::metadata` size check while + // `fs::copy` then reads indefinitely from the underlying special + // file, exhausting disk in the private staging dir. 
+ let archive_meta = match fs::symlink_metadata(&cached_archive) { + Ok(m) => m, + Err(e) => { + debug!("Cannot stat cached archive for {version}: {e}"); + return None; + } + }; + if !archive_meta.file_type().is_file() { + warn!( + "Cached archive for {version} is not a regular file \ + (symlink/special); discarding cache entry" + ); + return None; + } + if archive_meta.len() > crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64 { + warn!( + "Cached archive for {version} exceeds MAX_ARCHIVE_SIZE_BYTES \ + ({} bytes); discarding cache entry", + archive_meta.len() + ); + return None; + } + let sig_meta = match fs::symlink_metadata(&cached_sig) { + Ok(m) => m, + Err(e) => { + debug!("Cannot stat cached signature for {version}: {e}"); + return None; + } + }; + if !sig_meta.file_type().is_file() { + warn!( + "Cached signature for {version} is not a regular file \ + (symlink/special); discarding cache entry" + ); + return None; + } + if sig_meta.len() != signature::SIGNATURE_SIZE as u64 { + warn!( + "Cached signature for {version} has wrong size ({} bytes, \ + expected {}); discarding cache entry", + sig_meta.len(), + signature::SIGNATURE_SIZE + ); + return None; + } + + // Copy archive + signature into the caller-private directory. + // Everything below operates only on these private copies, which the + // attacker cannot reach — eliminating any verify/extract TOCTOU on + // the shared cache files. + let private_archive = private_dir.join(format!("cached-{version}.archive")); + let private_sig = private_dir.join(format!("cached-{version}.sig")); + + // Cleanup helper defined BEFORE any copy so even a partially-created + // destination from a failed copy is removed on every rejection path. 
+ let cleanup = |reason: &str| { + debug!("Cleaning staged cache copy for {version}: {reason}"); + let _ = fs::remove_file(&private_archive); + let _ = fs::remove_file(&private_sig); + }; + + if let Err(e) = fs::copy(&cached_archive, &private_archive) { + debug!("Could not stage cached archive for {version}: {e}"); + cleanup("archive copy failed"); + return None; + } + if let Err(e) = fs::copy(&cached_sig, &private_sig) { + debug!("Could not stage cached signature for {version}: {e}"); + cleanup("signature copy failed"); + return None; + } + + // Fast corruption pre-check on the PRIVATE copy (NOT the security + // decision). A copy error or truncation surfaces here. + let actual_hash = match sha256_file(&private_archive) { + Ok(h) => h, + Err(e) => { + cleanup(&format!("sha256 read failed: {e}")); + return None; + } + }; + if actual_hash != meta.archive_sha256 { + warn!( + "Binary cache SHA-256 mismatch for version {version} \ + (expected {}, got {actual_hash}) — ignoring cache entry", + meta.archive_sha256 + ); + cleanup("sha256 mismatch"); + return None; + } + + // THE SECURITY GATE: re-verify the ML-DSA-65 signature over the + // PRIVATE archive copy on every hit. The returned path is this same + // private copy, so the caller extracts exactly the bytes that were + // verified — a cache entry tampered with on disk (binary/archive + // swap, forged metadata, or a post-verify swap attempt) cannot + // produce a private copy whose signature verifies against the + // pinned release key. + if let Err(e) = self.verify_archive(&private_archive, &private_sig) { warn!( - "Binary cache SHA-256 mismatch for version {version} (expected {}, got {})", - meta.sha256, actual_hash + "Cached archive for version {version} FAILED ML-DSA signature \ + re-verification ({e}); discarding cache entry (possible \ + on-disk tampering). A fresh verified download will run." 
); + cleanup("signature re-verification failed"); return None; } - Some(bin_path) + debug!("Cached archive for version {version} passed ML-DSA re-verification"); + Some(private_archive) } - /// Store a binary in the cache. + /// Store a signature-verified archive in the cache. /// - /// Uses a write-to-temp-then-rename strategy so that readers never - /// observe partially written files. The metadata file is written last - /// so that `get_verified` only succeeds once both files are complete. + /// Both files are persisted (via write-to-temp-then-rename so readers + /// never observe partial writes); the metadata file is written last so + /// [`get_verified_archive`](Self::get_verified_archive) only succeeds + /// once every file is complete. + /// + /// Defence in depth: this re-verifies the archive against its signature + /// before caching, so a poisoned entry cannot be created through the + /// supported path even if a caller forgot to verify first. /// /// # Errors /// - /// Returns an error if the binary cannot be read or the cache files - /// cannot be written. - pub fn store(&self, version: &str, source_path: &std::path::Path) -> Result<()> { - let hash = sha256_file(source_path)?; + /// Returns an error if the signature does not verify, the inputs cannot + /// be read, or the cache files cannot be written. + pub fn store_archive( + &self, + version: &str, + archive_path: &Path, + signature_path: &Path, + ) -> Result<()> { + // Defence in depth: refuse to persist a non-regular file, an + // oversize archive, or a misshapen signature — mirroring the + // `get_verified_archive` cache-hit policy. `symlink_metadata` + // refuses to chase a symlink the caller may have planted. 
+ let archive_meta = fs::symlink_metadata(archive_path)?; + if !archive_meta.file_type().is_file() { + return Err(Error::Upgrade(format!( + "Refusing to cache archive for {version}: source is not a \ + regular file (symlink/special)" + ))); + } + let archive_len = archive_meta.len(); + if archive_len > crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64 { + return Err(Error::Upgrade(format!( + "Refusing to cache archive for {version}: size {archive_len} bytes \ + exceeds MAX_ARCHIVE_SIZE_BYTES" + ))); + } + let sig_meta = fs::symlink_metadata(signature_path)?; + if !sig_meta.file_type().is_file() { + return Err(Error::Upgrade(format!( + "Refusing to cache archive for {version}: signature is not a \ + regular file (symlink/special)" + ))); + } + let sig_len = sig_meta.len(); + if sig_len != signature::SIGNATURE_SIZE as u64 { + return Err(Error::Upgrade(format!( + "Refusing to cache archive for {version}: signature size {sig_len} \ + bytes, expected {}", + signature::SIGNATURE_SIZE + ))); + } + + self.verify_archive(archive_path, signature_path) + .map_err(|e| { + Error::Upgrade(format!( + "Refusing to cache archive for {version}: signature does not verify ({e})" + )) + })?; + + let archive_hash = sha256_file(archive_path)?; - let dest = self.cached_binary_path(version); + let dest_archive = self.cached_archive_path(version); + let dest_sig = self.cached_signature_path(version); let meta_path = self.meta_path(version); - // Write binary to a temp file then rename into place. - // Remove dest first on Windows where rename fails if it exists. 
- let tmp_bin = self.cache_dir.join(format!(".ant-node-{version}.tmp")); - fs::copy(source_path, &tmp_bin)?; - let _ = fs::remove_file(&dest); - fs::rename(&tmp_bin, &dest)?; + Self::atomic_copy( + archive_path, + &dest_archive, + &self + .cache_dir + .join(format!(".ant-node-{version}.archive.tmp")), + )?; + Self::atomic_copy( + signature_path, + &dest_sig, + &self.cache_dir.join(format!(".ant-node-{version}.sig.tmp")), + )?; let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map_err(|e| Error::Upgrade(format!("System clock error: {e}")))? .as_secs(); - let meta = CachedBinaryMeta { + let meta = CachedArchiveMeta { version: version.to_string(), - sha256: hash, + archive_sha256: archive_hash, cached_at_epoch_secs: now, }; let meta_json = serde_json::to_string(&meta) .map_err(|e| Error::Upgrade(format!("Failed to serialize binary cache meta: {e}")))?; - // Write metadata to a temp file then rename into place + // Metadata written last so a reader never sees a complete meta file + // pointing at an incomplete archive/signature pair. let tmp_meta = self.cache_dir.join(format!(".ant-node-{version}.meta.tmp")); let mut f = File::create(&tmp_meta)?; f.write_all(meta_json.as_bytes())?; @@ -127,7 +403,10 @@ impl BinaryCache { let _ = fs::remove_file(&meta_path); fs::rename(&tmp_meta, &meta_path)?; - debug!("Cached binary for version {version} at {}", dest.display()); + debug!( + "Cached verified archive for version {version} at {}", + dest_archive.display() + ); Ok(()) } @@ -135,11 +414,11 @@ impl BinaryCache { /// /// This prevents multiple nodes from downloading the same archive /// concurrently — the first acquires the lock and downloads, the rest - /// wait and then find the binary already cached. + /// wait and then find the archive already cached. /// /// The lock is released when the returned guard is dropped. /// - /// **Note:** `lock_exclusive()` blocks the calling thread. 
Callers in + /// **Note:** `lock_exclusive()` blocks the calling thread. Callers in /// async contexts should wrap this call in `tokio::task::spawn_blocking`. /// /// # Errors @@ -156,13 +435,17 @@ impl BinaryCache { // -- private helpers ----------------------------------------------------- + /// Copy `src` to `dest` atomically via a temp file + rename. + fn atomic_copy(src: &Path, dest: &Path, tmp: &Path) -> Result<()> { + fs::copy(src, tmp)?; + // Remove dest first on Windows where rename fails if it exists. + let _ = fs::remove_file(dest); + fs::rename(tmp, dest)?; + Ok(()) + } + fn meta_path(&self, version: &str) -> PathBuf { - let name = if cfg!(windows) { - format!("ant-node-{version}.exe.meta.json") - } else { - format!("ant-node-{version}.meta.json") - }; - self.cache_dir.join(name) + self.cache_dir.join(format!("ant-node-{version}.meta.json")) } } @@ -174,7 +457,7 @@ pub struct DownloadLockGuard { } /// Compute the hex-encoded SHA-256 digest of a file. -fn sha256_file(path: &std::path::Path) -> Result { +fn sha256_file(path: &Path) -> Result { let mut file = File::open(path)?; let mut hasher = Sha256::new(); let mut buf = [0u8; 8192]; @@ -198,58 +481,275 @@ fn sha256_file(path: &std::path::Path) -> Result { #[allow(clippy::unwrap_used, clippy::expect_used)] mod tests { use super::*; + use saorsa_pqc::api::sig::{ml_dsa_65, MlDsaPublicKey, MlDsaSecretKey}; + use std::sync::OnceLock; use tempfile::TempDir; + /// One generated keypair for the whole test module (keygen is expensive). + fn test_keypair() -> &'static (MlDsaPublicKey, MlDsaSecretKey) { + static KP: OnceLock<(MlDsaPublicKey, MlDsaSecretKey)> = OnceLock::new(); + KP.get_or_init(|| ml_dsa_65().generate_keypair().unwrap()) + } + + fn cache_with_test_key(dir: &Path) -> BinaryCache { + BinaryCache::new_with_verify_key(dir.to_path_buf(), test_keypair().0.clone()) + } + + /// A caller-private staging directory (the per-upgrade temp dir in + /// production). Returned so it outlives the call. 
+ fn priv_dir() -> TempDir { + TempDir::new().unwrap() + } + + /// Write an archive + a valid detached signature over it. + fn make_signed_archive(dir: &Path, contents: &[u8]) -> (PathBuf, PathBuf) { + let archive = dir.join("src-archive"); + fs::write(&archive, contents).unwrap(); + let sig = ml_dsa_65() + .sign_with_context(&test_keypair().1, contents, signature::SIGNING_CONTEXT) + .unwrap(); + let sig_path = dir.join("src-archive.sig"); + fs::write(&sig_path, sig.to_bytes()).unwrap(); + (archive, sig_path) + } + #[test] fn test_miss_returns_none() { let tmp = TempDir::new().unwrap(); - let cache = BinaryCache::new(tmp.path().to_path_buf()); - assert!(cache.get_verified("1.0.0").is_none()); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + assert!(cache.get_verified_archive("1.0.0", pd.path()).is_none()); } #[test] - fn test_store_and_get_verified() { + fn test_store_and_get_verified_archive() { let tmp = TempDir::new().unwrap(); - let cache = BinaryCache::new(tmp.path().to_path_buf()); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + let (archive, sig) = make_signed_archive(tmp.path(), b"signed archive bytes"); + cache.store_archive("1.2.3", &archive, &sig).unwrap(); + + let got = cache + .get_verified_archive("1.2.3", pd.path()) + .expect("cache hit"); + assert_eq!(fs::read(&got).unwrap(), b"signed archive bytes"); + // The returned path must be the PRIVATE copy, not the shared cache + // file (that is what closes the verify/extract TOCTOU). 
+ assert!( + got.starts_with(pd.path()), + "returned archive must be the caller-private copy, got {got:?}" + ); + assert_ne!(got, cache.cached_archive_path("1.2.3")); + } - // Create a fake binary - let src = tmp.path().join("source-bin"); - fs::write(&src, b"hello world binary").unwrap(); + #[test] + fn test_store_rejects_unsigned_archive() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); - cache.store("1.2.3", &src).unwrap(); + let archive = tmp.path().join("a"); + fs::write(&archive, b"unsigned").unwrap(); + let bad_sig = tmp.path().join("a.sig"); + fs::write(&bad_sig, vec![0u8; signature::SIGNATURE_SIZE]).unwrap(); - let result = cache.get_verified("1.2.3"); - assert!(result.is_some()); - let cached_path = result.unwrap(); - assert_eq!(fs::read(&cached_path).unwrap(), b"hello world binary"); + assert!(cache.store_archive("1.0.0", &archive, &bad_sig).is_err()); + assert!(cache.get_verified_archive("1.0.0", pd.path()).is_none()); } + /// An attacker who swaps the cached archive on disk (and even forges a + /// matching SHA-256 in the metadata) cannot get it trusted, because + /// the ML-DSA signature is re-verified on every hit. #[test] - fn test_sha256_mismatch_returns_none() { + fn test_tampered_cached_archive_is_rejected() { let tmp = TempDir::new().unwrap(); - let cache = BinaryCache::new(tmp.path().to_path_buf()); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + let (archive, sig) = make_signed_archive(tmp.path(), b"legit release archive"); + cache.store_archive("2.0.0", &archive, &sig).unwrap(); + assert!(cache.get_verified_archive("2.0.0", pd.path()).is_some()); + + // Attacker overwrites the cached archive with a malicious payload... + let cached_archive = cache.cached_archive_path("2.0.0"); + fs::write(&cached_archive, b"malicious payload").unwrap(); + + // ...and forges the metadata SHA-256 so the corruption pre-check passes. 
+ let forged_hash = { + let mut h = Sha256::new(); + h.update(b"malicious payload"); + hex::encode(h.finalize()) + }; + let meta = CachedArchiveMeta { + version: "2.0.0".to_string(), + archive_sha256: forged_hash, + cached_at_epoch_secs: 0, + }; + fs::write( + cache.meta_path("2.0.0"), + serde_json::to_string(&meta).unwrap(), + ) + .unwrap(); + + // The SHA-256 pre-check now passes, but ML-DSA re-verification of the + // swapped archive against the key fails → entry rejected. + assert!( + cache.get_verified_archive("2.0.0", pd.path()).is_none(), + "tampered cache entry must NOT be trusted even with a forged \ + matching SHA-256 — the signature gate runs on every hit" + ); + } - // Store a valid binary - let src = tmp.path().join("source-bin"); - fs::write(&src, b"original content").unwrap(); - cache.store("1.0.0", &src).unwrap(); + /// TOCTOU defence: even if an attacker swaps the *shared* cache archive + /// for malicious bytes immediately after a hit, the previously returned + /// path (a caller-private copy) still contains the verified bytes, so + /// what gets extracted/executed is exactly what was signature-verified. + #[test] + fn test_returned_archive_is_private_copy_immune_to_post_hit_swap() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + let (archive, sig) = make_signed_archive(tmp.path(), b"the real signed release"); + cache.store_archive("3.0.0", &archive, &sig).unwrap(); + + let verified = cache + .get_verified_archive("3.0.0", pd.path()) + .expect("cache hit"); + + // Attacker swaps the SHARED cache archive right after verification. + fs::write( + cache.cached_archive_path("3.0.0"), + b"post-verify malicious swap", + ) + .unwrap(); + + // The path the caller will extract from is the private copy and is + // unaffected by the shared-file swap. 
+ assert_eq!( + fs::read(&verified).unwrap(), + b"the real signed release", + "extraction must read the verified private bytes, not the \ + attacker's post-verification swap" + ); + } - // Corrupt the cached binary - let cached = cache.cached_binary_path("1.0.0"); - fs::write(&cached, b"corrupted content").unwrap(); + #[test] + fn test_missing_signature_returns_none() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + let (archive, sig) = make_signed_archive(tmp.path(), b"data"); + cache.store_archive("1.0.0", &archive, &sig).unwrap(); - assert!(cache.get_verified("1.0.0").is_none()); + // Attacker deletes the signature to try to skip verification. + fs::remove_file(cache.cached_signature_path("1.0.0")).unwrap(); + assert!(cache.get_verified_archive("1.0.0", pd.path()).is_none()); } #[test] fn test_missing_meta_returns_none() { let tmp = TempDir::new().unwrap(); - let cache = BinaryCache::new(tmp.path().to_path_buf()); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + let (archive, sig) = make_signed_archive(tmp.path(), b"data"); + cache.store_archive("1.0.0", &archive, &sig).unwrap(); + fs::remove_file(cache.meta_path("1.0.0")).unwrap(); + assert!(cache.get_verified_archive("1.0.0", pd.path()).is_none()); + } + + /// Size policy: an attacker with cache-dir write cannot OOM/disk-exhaust + /// the verifier by dropping a multi-GB archive — `get_verified_archive` + /// stat-checks the cached archive against `MAX_ARCHIVE_SIZE_BYTES` BEFORE + /// any copy or `fs::read` reaches `signature::verify_from_file`. 
+ #[test] + fn test_oversize_cached_archive_is_rejected_before_copy() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + // Plant a real signed entry so the meta/sig pass earlier checks… + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("3.1.0", &archive, &sig).unwrap(); + // …then truncate-grow the cached archive past the limit. + let cached_archive = cache.cached_archive_path("3.1.0"); + let oversize = crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64 + 1; + { + let f = File::create(&cached_archive).unwrap(); + f.set_len(oversize).unwrap(); + } + + // The size gate rejects pre-copy → no private archive ever staged. + assert!(cache.get_verified_archive("3.1.0", pd.path()).is_none()); + let private_archive = pd.path().join("cached-3.1.0.archive"); + assert!( + !private_archive.exists(), + "oversize entry must NOT be staged into private dir" + ); + } + + #[test] + fn test_wrong_size_signature_is_rejected_before_copy() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); - // Write a binary but no meta file - let cached = cache.cached_binary_path("1.0.0"); - fs::write(&cached, b"binary data").unwrap(); + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("3.2.0", &archive, &sig).unwrap(); + // Replace the cached signature with the wrong size. + fs::write(cache.cached_signature_path("3.2.0"), b"too-short").unwrap(); - assert!(cache.get_verified("1.0.0").is_none()); + assert!(cache.get_verified_archive("3.2.0", pd.path()).is_none()); + } + + /// `store_archive` itself refuses to persist an oversize archive — even + /// from a (hypothetically) misbehaving caller that bypassed the + /// download-time size cap. 
+ #[test] + fn test_store_archive_rejects_oversize() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + + // Make a sparse "archive" past the limit and any signature. + let big = tmp.path().join("big.archive"); + { + let f = File::create(&big).unwrap(); + f.set_len(crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64 + 1) + .unwrap(); + } + let any_sig = tmp.path().join("any.sig"); + fs::write(&any_sig, vec![0u8; signature::SIGNATURE_SIZE]).unwrap(); + + assert!(cache.store_archive("9.9.9", &big, &any_sig).is_err()); + } + + /// Round-3 regression: a cache-dir writer cannot bypass the size gate + /// by planting a symlink whose `stat(2)` size is small but whose + /// target reads indefinitely (e.g. `/dev/zero`). `symlink_metadata` + /// + `is_file()` rejects the entry before any `fs::copy` reads it. + #[cfg(unix)] + #[test] + fn test_symlink_cached_archive_is_rejected_before_copy() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + // Plant a legit signed entry so meta/version/sig-size are good… + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("4.0.0", &archive, &sig).unwrap(); + // …then replace the cached archive with a symlink to /dev/zero. + let cached_archive = cache.cached_archive_path("4.0.0"); + fs::remove_file(&cached_archive).unwrap(); + std::os::unix::fs::symlink("/dev/zero", &cached_archive).unwrap(); + + assert!( + cache.get_verified_archive("4.0.0", pd.path()).is_none(), + "a symlinked cached archive must be rejected pre-copy, \ + not chased into /dev/zero" + ); + // Nothing should have been staged. 
+ assert!(!pd.path().join("cached-4.0.0.archive").exists()); } } diff --git a/src/upgrade/cache_dir.rs b/src/upgrade/cache_dir.rs index 75458e96..aa099ddb 100644 --- a/src/upgrade/cache_dir.rs +++ b/src/upgrade/cache_dir.rs @@ -26,6 +26,27 @@ pub fn upgrade_cache_dir() -> Result { let cache_dir = project_dirs.data_dir().join("upgrades"); fs::create_dir_all(&cache_dir)?; + // Defence in depth: restrict the shared upgrade cache to the owning + // user (0700) so a co-located low-privilege process cannot + // write/tamper with cached archives in the first place. The ML-DSA + // re-verification on every cache hit is the primary control; this just + // shrinks the attack surface. Best-effort on Unix; a failure to tighten + // permissions must not break upgrades (the crypto gate still holds). + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + if let Ok(meta) = fs::metadata(&cache_dir) { + let mut perms = meta.permissions(); + perms.set_mode(0o700); + if let Err(e) = fs::set_permissions(&cache_dir, perms) { + crate::logging::warn!( + "Could not tighten upgrade cache dir permissions to 0700 ({e}); \ + ML-DSA re-verification still protects cached archives" + ); + } + } + } + Ok(cache_dir) } From 20201325f78401fcf668da5d3fe58422101bf3ad Mon Sep 17 00:00:00 2001 From: grumbach Date: Wed, 20 May 2026 17:00:07 +0900 Subject: [PATCH 02/23] address review: harden meta read, narrow copy TOCTOU, fix stale wording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review feedback on the upgrade binary cache: - `meta.json` was read with an unbounded `fs::read_to_string`. An attacker with write access to the shared cache directory could plant the metadata sidecar as a symlink to `/dev/zero` or as a huge file and stall the read into a hang/OOM before the archive/sig hardening ran. 
The metadata path now goes through the same open-once-and-validate gate as the archive: regular-file check on the opened handle, capped at `MAX_META_BYTES` (4 KiB). - Archive + signature staging previously did `symlink_metadata` (path) followed by `fs::copy` (path), leaving a small TOCTOU window where an attacker could race-swap the path to a symlink/FIFO/device or an oversized file between the check and the copy. Both files are now opened once via `open_regular_capped`, validated on the resulting `File` handle (size + file-type), and copied into the private staging dir from the open handle (wrapped in `Read::take(len)` as belt-and-braces against a post-open extension). All subsequent operations on those files use the staged private bytes, never the shared path. - Comment fix: the prior comment claimed `sha256_file` loads the archive into memory in full. It actually streams in 8 KiB chunks; the memory-pressure concern is `signature::verify_from_file*` (FIPS-204 requires the message as a slice). Wording updated. - Stale error message "Failed to serialize binary cache meta" updated to "Failed to serialize cached archive metadata" — the cache now stores archive metadata, not extracted-binary metadata. Two new tests: test_oversized_meta_is_rejected test_meta_symlink_to_special_file_is_rejected (Unix-only) 488 lib tests pass; cfd clean. 
--- src/upgrade/binary_cache.rs | 232 ++++++++++++++++++++++++++---------- 1 file changed, 169 insertions(+), 63 deletions(-) diff --git a/src/upgrade/binary_cache.rs b/src/upgrade/binary_cache.rs index b708a1c5..3eef3895 100644 --- a/src/upgrade/binary_cache.rs +++ b/src/upgrade/binary_cache.rs @@ -51,10 +51,18 @@ use fs2::FileExt; use saorsa_pqc::api::sig::MlDsaPublicKey; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use std::fs::{self, File}; -use std::io::{Read, Write}; +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Read, Write}; use std::path::{Path, PathBuf}; +/// Maximum size accepted for the `.meta.json` sidecar. +/// +/// A well-formed `CachedArchiveMeta` serialises to roughly 120 bytes; the +/// 4 KiB cap is comfortably above any legitimate payload and tight enough +/// that an attacker who plants a metadata file the size of `/dev/zero` +/// cannot stall the metadata read into a hang or OOM. +const MAX_META_BYTES: u64 = 4 * 1024; + /// On-disk cache for downloaded, signature-verified upgrade archives. #[derive(Clone)] pub struct BinaryCache { @@ -157,13 +165,40 @@ impl BinaryCache { /// The caller MUST extract the binary from the returned (private) archive /// path, so the executed bytes always derive from signature-verified /// input that no other principal could have modified post-verification. + // The verifier-side cache-hit gate is read top-to-bottom by anyone + // auditing the security model. Splitting it into smaller helpers just + // to placate clippy's line limit would scatter the threat model across + // call sites without improving safety. 
+ #[allow(clippy::too_many_lines)] #[must_use] pub fn get_verified_archive(&self, version: &str, private_dir: &Path) -> Option { let cached_archive = self.cached_archive_path(version); let cached_sig = self.cached_signature_path(version); let meta_path = self.meta_path(version); - let meta_data = fs::read_to_string(&meta_path).ok()?; + // Read the metadata sidecar with a small, opened-handle size cap so + // an attacker with cache-dir write cannot plant `meta.json` as a + // symlink to `/dev/zero` (or any large/special file) and force a + // hang/OOM here before the archive/sig hardening runs. + let meta_data = { + let (mut meta_file, meta_len) = match open_regular_capped(&meta_path, MAX_META_BYTES) { + Ok(pair) => pair, + Err(e) => { + debug!("Rejecting cache metadata for {version}: {e}"); + return None; + } + }; + // `meta_len` is capped at MAX_META_BYTES (4 KiB), so the + // conversion can never fail in practice; the `unwrap_or` + // fallback makes that explicit for clippy on 32-bit targets. + let cap = usize::try_from(meta_len).unwrap_or(usize::MAX); + let mut buf = String::with_capacity(cap); + if let Err(e) = meta_file.read_to_string(&mut buf) { + debug!("Failed to read cache metadata for {version}: {e}"); + return None; + } + buf + }; let meta: CachedArchiveMeta = serde_json::from_str(&meta_data).ok()?; if meta.version != version { @@ -171,88 +206,77 @@ impl BinaryCache { return None; } - // Size policy gate — runs BEFORE we copy or read the cached files. + // Open archive + signature ONCE each with size and file-type + // validation on the opened handles. Subsequent reads / hash / + // signature verification all go through the FDs opened here — there + // is no second path-based stat or open after this point, so an + // attacker who races a swap on the cache-dir paths (symlink, FIFO, + // device, oversized file) after these validations cannot redirect + // what gets staged into the private dir. 
// - // `signature::verify_from_file*` and `sha256_file` both load the - // archive into memory in full. An attacker with cache-dir write - // access could otherwise drop a multi-GB `.archive` and force disk - // exhaustion in the staging dir or an OOM during re-verification - // before the entry is rejected. The download path already enforces - // `MAX_ARCHIVE_SIZE_BYTES`; cache hits must honour the same bound, - // plus the fixed `SIGNATURE_SIZE`. - // Use `symlink_metadata` (does NOT follow symlinks) and require a - // regular file. Otherwise a cache-dir writer could plant - // `ant-node-X.archive -> /dev/zero` (or a FIFO/device) whose `.len()` - // stats as 0 — passing a `fs::metadata` size check while - // `fs::copy` then reads indefinitely from the underlying special - // file, exhausting disk in the private staging dir. - let archive_meta = match fs::symlink_metadata(&cached_archive) { - Ok(m) => m, + // Memory pressure note: `signature::verify_from_file*` reads the + // archive into memory in full (it is the FIPS-204 verifier's + // contract — message must be provided as a slice). `sha256_file` + // streams in 8 KiB chunks and is not an OOM vector. The + // `MAX_ARCHIVE_SIZE_BYTES` cap bounds the in-memory load and the + // staging-dir disk footprint together. 
+ let (mut archive_file, archive_len) = match open_regular_capped( + &cached_archive, + crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64, + ) { + Ok(pair) => pair, Err(e) => { - debug!("Cannot stat cached archive for {version}: {e}"); + warn!("Rejecting cached archive for {version}: {e}"); return None; } }; - if !archive_meta.file_type().is_file() { - warn!( - "Cached archive for {version} is not a regular file \ - (symlink/special); discarding cache entry" - ); - return None; - } - if archive_meta.len() > crate::upgrade::apply::MAX_ARCHIVE_SIZE_BYTES as u64 { - warn!( - "Cached archive for {version} exceeds MAX_ARCHIVE_SIZE_BYTES \ - ({} bytes); discarding cache entry", - archive_meta.len() - ); - return None; - } - let sig_meta = match fs::symlink_metadata(&cached_sig) { - Ok(m) => m, - Err(e) => { - debug!("Cannot stat cached signature for {version}: {e}"); - return None; - } - }; - if !sig_meta.file_type().is_file() { - warn!( - "Cached signature for {version} is not a regular file \ - (symlink/special); discarding cache entry" - ); - return None; - } - if sig_meta.len() != signature::SIGNATURE_SIZE as u64 { + let (mut sig_file, sig_len) = + match open_regular_capped(&cached_sig, signature::SIGNATURE_SIZE as u64) { + Ok(pair) => pair, + Err(e) => { + warn!("Rejecting cached signature for {version}: {e}"); + return None; + } + }; + if sig_len != signature::SIGNATURE_SIZE as u64 { + // open_regular_capped enforces ≤ max; we additionally require + // EXACTLY SIGNATURE_SIZE (a shorter sig is not valid ML-DSA-65). warn!( - "Cached signature for {version} has wrong size ({} bytes, \ - expected {}); discarding cache entry", - sig_meta.len(), + "Cached signature for {version} has wrong size ({sig_len} bytes, \ + expected {})", signature::SIGNATURE_SIZE ); return None; } - // Copy archive + signature into the caller-private directory. 
- // Everything below operates only on these private copies, which the - // attacker cannot reach — eliminating any verify/extract TOCTOU on - // the shared cache files. + // Stream the validated archive + signature into the caller-private + // directory FROM THE ALREADY-OPEN HANDLES (not from the path), so + // the bytes the verifier reads are the exact bytes the open-handle + // metadata checks were performed against. `take()` is belt-and- + // braces against an attacker who extends the file after open. let private_archive = private_dir.join(format!("cached-{version}.archive")); let private_sig = private_dir.join(format!("cached-{version}.sig")); - // Cleanup helper defined BEFORE any copy so even a partially-created - // destination from a failed copy is removed on every rejection path. let cleanup = |reason: &str| { debug!("Cleaning staged cache copy for {version}: {reason}"); let _ = fs::remove_file(&private_archive); let _ = fs::remove_file(&private_sig); }; - if let Err(e) = fs::copy(&cached_archive, &private_archive) { + if let Err(e) = (|| -> io::Result<()> { + let mut dest = File::create(&private_archive)?; + io::copy(&mut (&mut archive_file).take(archive_len), &mut dest)?; + Ok(()) + })() { debug!("Could not stage cached archive for {version}: {e}"); cleanup("archive copy failed"); return None; } - if let Err(e) = fs::copy(&cached_sig, &private_sig) { + if let Err(e) = (|| -> io::Result<()> { + let mut dest = File::create(&private_sig)?; + io::copy(&mut (&mut sig_file).take(sig_len), &mut dest)?; + Ok(()) + })() { debug!("Could not stage cached signature for {version}: {e}"); cleanup("signature copy failed"); return None; @@ -390,8 +414,9 @@ impl BinaryCache { cached_at_epoch_secs: now, }; - let meta_json = serde_json::to_string(&meta) - .map_err(|e| Error::Upgrade(format!("Failed to serialize binary cache meta: {e}")))?; + let meta_json = serde_json::to_string(&meta).map_err(|e| { + Error::Upgrade(format!("Failed to serialize cached archive metadata: 
{e}")) + })?; // Metadata written last so a reader never sees a complete meta file // pointing at an incomplete archive/signature pair. @@ -456,6 +481,39 @@ pub struct DownloadLockGuard { _file: File, } +/// Open `path` as a regular file with size at most `max_len`, validating +/// the metadata on the **opened handle** so a race between any prior stat +/// and the read cannot substitute a special file (FIFO/device/socket) or +/// an oversized payload. A symlink whose target is a regular file is +/// accepted (it's just an indirect path to a regular file — the attacker +/// who placed the link already needed write access to the cache dir, the +/// same access level as directly editing the regular file); a symlink +/// whose target is a special file is rejected by the `is_file()` check on +/// the opened handle. +/// +/// Returns `(File, len)` on success; the returned `File` is positioned at +/// offset 0 and may be `io::copy`'d into a destination — callers should +/// wrap with `Read::take(max_len)` so an attacker who extends the file +/// after the metadata read cannot stream beyond the cap. +fn open_regular_capped(path: &Path, max_len: u64) -> io::Result<(File, u64)> { + let file = OpenOptions::new().read(true).open(path)?; + let meta = file.metadata()?; + if !meta.file_type().is_file() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "not a regular file (FIFO/device/socket/dir)", + )); + } + let len = meta.len(); + if len > max_len { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("file exceeds size cap ({len} > {max_len})"), + )); + } + Ok((file, len)) +} + /// Compute the hex-encoded SHA-256 digest of a file. fn sha256_file(path: &Path) -> Result { let mut file = File::open(path)?; @@ -752,4 +810,52 @@ mod tests { // Nothing should have been staged. 
assert!(!pd.path().join("cached-4.0.0.archive").exists()); } + + /// `.meta.json` is read through the same size/file-type gate as the + /// archive and signature: planting a multi-MB metadata file (or a + /// metadata symlink to a special file) is rejected pre-parse without + /// risking a hang or large allocation. + #[test] + fn test_oversized_meta_is_rejected() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + // Establish a valid entry so archive/sig are well-formed. + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("5.0.0", &archive, &sig).unwrap(); + + // Overwrite meta with a file well above MAX_META_BYTES of garbage. + let meta_path = cache.meta_path("5.0.0"); + let huge = vec![b'a'; usize::try_from(MAX_META_BYTES).unwrap_or(usize::MAX) + 1024]; + fs::write(&meta_path, &huge).unwrap(); + + assert!( + cache.get_verified_archive("5.0.0", pd.path()).is_none(), + "oversized metadata file must be rejected before parsing" + ); + } + + /// `.meta.json` planted as a symlink to a special file (e.g. + /// `/dev/zero`) is rejected by the open-handle file-type check, + /// without hanging or OOM'ing on the read. 
+ #[cfg(unix)] + #[test] + fn test_meta_symlink_to_special_file_is_rejected() { + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("5.1.0", &archive, &sig).unwrap(); + + let meta_path = cache.meta_path("5.1.0"); + fs::remove_file(&meta_path).unwrap(); + std::os::unix::fs::symlink("/dev/zero", &meta_path).unwrap(); + + assert!( + cache.get_verified_archive("5.1.0", pd.path()).is_none(), + "metadata symlink to a special file must be rejected" + ); + } } From 0d53f7212e6d1f25d5ab0a4ce221cd9902e1452c Mon Sep 17 00:00:00 2001 From: grumbach Date: Fri, 22 May 2026 15:07:07 +0900 Subject: [PATCH 03/23] fix(upgrade): reject FIFO/pipe planted at cache entry path Close a local DoS on auto-upgrade: a cache-dir attacker could plant a FIFO at ant-node-.archive (or .sig / .meta.json) and open() for reading would block indefinitely waiting for a writer, hanging the upgrade. open_regular_capped previously only checked file type AFTER the blocking open. Two-layer defence in open_regular_capped: - Pre-check via fs::metadata (follows symlinks), reject non-regular files before open(). A symlink-to-regular is still accepted as before; a symlink-to-FIFO/device/socket is rejected. - On Unix, also open with O_NONBLOCK so a race between the pre-check and open() cannot reopen the FIFO window. Reads on regular files ignore O_NONBLOCK, so this is a no-op for the happy path. Platform- specific constant (0o4000 Linux, 0x0004 macOS/BSD); fallback to no flag on unknown unix-likes. The existing post-open is_file() check on the file handle remains the TOCTOU-safe final gate. New regression test test_fifo_cached_archive_does_not_hang plants a real FIFO via mkfifo and asserts return in well under 2s. 14/14 binary_cache tests pass; cfd clean. 
--- src/upgrade/binary_cache.rs | 122 +++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/src/upgrade/binary_cache.rs b/src/upgrade/binary_cache.rs index 3eef3895..cbe7c52a 100644 --- a/src/upgrade/binary_cache.rs +++ b/src/upgrade/binary_cache.rs @@ -491,12 +491,78 @@ pub struct DownloadLockGuard { /// whose target is a special file is rejected by the `is_file()` check on /// the opened handle. /// +/// On Unix, `open()` of a FIFO/named-pipe for reading blocks until a +/// writer connects, so a cache-dir attacker could otherwise hang the +/// upgrade indefinitely by planting a FIFO at the cache entry's path. We +/// (a) reject non-regular files via a `fs::metadata()` pre-check (follows +/// symlinks, so a symlink-to-regular is still accepted), and (b) on Unix +/// also open with `O_NONBLOCK` as a belt-and-braces defence in case the +/// pre-check races a swap. The post-open `is_file()` on the opened handle +/// remains the TOCTOU-safe gate. +/// /// Returns `(File, len)` on success; the returned `File` is positioned at /// offset 0 and may be `io::copy`'d into a destination — callers should /// wrap with `Read::take(max_len)` so an attacker who extends the file /// after the metadata read cannot stream beyond the cap. fn open_regular_capped(path: &Path, max_len: u64) -> io::Result<(File, u64)> { - let file = OpenOptions::new().read(true).open(path)?; + // Pre-check: refuse to even open a non-regular file. This is the + // first line of defence against an attacker who planted a FIFO at + // `path` — opening a FIFO for reading on Unix blocks until a writer + // connects, hanging the upgrade indefinitely. `fs::metadata` follows + // symlinks, so a symlink whose target is a regular file is accepted + // here and a symlink whose target is a FIFO/device/socket is rejected. 
+ let pre_meta = fs::metadata(path)?; + if !pre_meta.file_type().is_file() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "not a regular file (FIFO/device/socket/dir)", + )); + } + + // Belt-and-braces against a pre-check vs open() race: on Unix also + // open with O_NONBLOCK, so even if an attacker swaps the regular file + // for a FIFO between the metadata read and open(), the open() returns + // immediately instead of blocking on a writer. Reads on a regular file + // ignore O_NONBLOCK, so this is a no-op for the happy path. The + // post-open is_file() check below still catches the swap. + let file = { + let mut opts = OpenOptions::new(); + opts.read(true); + #[cfg(unix)] + { + use std::os::unix::fs::OpenOptionsExt; + // O_NONBLOCK is platform-specific: 0o4000 on Linux, 0x0004 on + // macOS/*BSD. Reads on a regular file ignore O_NONBLOCK on all + // these platforms, so this is a no-op for the happy path. + #[cfg(target_os = "linux")] + const O_NONBLOCK: i32 = 0o4000; + #[cfg(any( + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + ))] + const O_NONBLOCK: i32 = 0x0004; + // Fallback for other unix-likes: skip the flag rather than + // guess wrong. The pre-check + post-open is_file() still gate. + #[cfg(not(any( + target_os = "linux", + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + )))] + const O_NONBLOCK: i32 = 0; + if O_NONBLOCK != 0 { + opts.custom_flags(O_NONBLOCK); + } + } + opts.open(path)? + }; let meta = file.metadata()?; if !meta.file_type().is_file() { return Err(io::Error::new( @@ -836,6 +902,60 @@ mod tests { ); } + /// A cache-dir attacker who replaces the cached archive with a FIFO + /// must not be able to hang `get_verified_archive` waiting for a + /// writer to connect. 
The pre-check + O_NONBLOCK belt-and-braces + /// returns immediately with an error, the cache hit is abandoned, and + /// the caller falls back to a fresh verified download. + #[cfg(unix)] + #[test] + fn test_fifo_cached_archive_does_not_hang() { + use std::time::{Duration, Instant}; + let tmp = TempDir::new().unwrap(); + let cache = cache_with_test_key(tmp.path()); + let pd = priv_dir(); + + // Plant a legit signed entry so meta/version/sig-size are good, + // then replace the cached archive with a FIFO. Without the + // pre-check + O_NONBLOCK, opening the FIFO for reading would + // block until a writer connected. + let (archive, sig) = make_signed_archive(tmp.path(), b"legit"); + cache.store_archive("6.0.0", &archive, &sig).unwrap(); + let cached_archive = cache.cached_archive_path("6.0.0"); + fs::remove_file(&cached_archive).unwrap(); + let cstr = std::ffi::CString::new(cached_archive.as_os_str().as_encoded_bytes()).unwrap(); + // mkfifo via libc-equivalent: use the nix-free path through + // `std::process::Command` to avoid pulling a libc dep just for + // the test. `mkfifo` is in coreutils on Linux and bundled on + // macOS — both CI targets. + let mkfifo_ok = std::process::Command::new("mkfifo") + .arg(cstr.to_str().unwrap()) + .status() + .ok() + .is_some_and(|s| s.success()); + if !mkfifo_ok { + // If mkfifo isn't available skip rather than fail the suite. + eprintln!("mkfifo unavailable, skipping FIFO test"); + return; + } + + let start = Instant::now(); + let got = cache.get_verified_archive("6.0.0", pd.path()); + let elapsed = start.elapsed(); + + assert!( + got.is_none(), + "a FIFO planted at the cached archive path must be rejected" + ); + assert!( + elapsed < Duration::from_secs(2), + "open of FIFO returned in {elapsed:?}, expected ≪ 2s — \ + pre-check or O_NONBLOCK is not catching this" + ); + // Nothing should have been staged. 
+ assert!(!pd.path().join("cached-6.0.0.archive").exists()); + } + /// `.meta.json` planted as a symlink to a special file (e.g. /// `/dev/zero`) is rejected by the open-handle file-type check, /// without hanging or OOM'ing on the read. From aeb754bfbf80afe397cd757dfc6e369e6d168ca6 Mon Sep 17 00:00:00 2001 From: grumbach Date: Fri, 22 May 2026 15:14:04 +0900 Subject: [PATCH 04/23] refactor(upgrade): address review on FIFO DoS fix Round 2 from adversarial review: - Replace hand-coded O_NONBLOCK constants with libc::O_NONBLOCK. The previous 0o4000/0x0004 per-OS values were correct on x86_64/aarch64/arm but wrong on Linux/MIPS (0o200) and Linux/SPARC (0x4000), where 0o4000 maps to O_NOATIME. Using the libc constant always picks the right value for the target arch. Add libc as a Unix-only direct dependency (was already transitive). - Test test_fifo_cached_archive_does_not_hang: replace the mkfifo shell-out with libc::mkfifo so a CI image that drops coreutils cannot silently skip this test. Bump the budget from 2s to 5s to absorb GitHub Actions macOS runner cold-start variance, since the failure mode "O_NONBLOCK wrong on this arch" and "CI runner slow" look identical from the assertion. - Document the load-bearing invariant on get_verified_archive's private_dir: callers MUST supply a process-private 0o700 dir (apply.rs already does via tempfile + permissions). Without that the reopens-by-path in sha256_file/verify_archive would reopen a TOCTOU window. - Add a cross-reference comment explaining the intentional asymmetry between store_archive (uses symlink_metadata, rejects symlinks) and open_regular_capped (uses fs::metadata, accepts symlink-to-regular) so a later editor doesn't unify them in the wrong direction. 14/14 binary_cache tests pass, 489/489 lib tests pass, cfd clean. 
--- Cargo.toml | 3 ++ src/upgrade/binary_cache.rs | 86 ++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ab3f24ac..002c8358 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,6 +106,9 @@ page_size = "0.6" # Protocol serialization postcard = { version = "1.1.3", features = ["use-std"] } +[target.'cfg(unix)'.dependencies] +libc = "0.2" + [target.'cfg(windows)'.dependencies] self-replace = "1" diff --git a/src/upgrade/binary_cache.rs b/src/upgrade/binary_cache.rs index cbe7c52a..af505fdd 100644 --- a/src/upgrade/binary_cache.rs +++ b/src/upgrade/binary_cache.rs @@ -165,6 +165,14 @@ impl BinaryCache { /// The caller MUST extract the binary from the returned (private) archive /// path, so the executed bytes always derive from signature-verified /// input that no other principal could have modified post-verification. + /// + /// `private_dir` is a load-bearing security invariant: it MUST be a + /// process-private, mode-`0o700` directory that no other principal + /// can write to. The caller in `apply.rs` creates it via + /// `tempfile::Builder::permissions(0o700).tempdir_in(binary_dir)` — + /// any future caller MUST uphold the same invariant, otherwise the + /// reopens by path in `sha256_file` and `verify_archive` would re- + /// introduce a TOCTOU window. // The verifier-side cache-hit gate is read top-to-bottom by anyone // auditing the security model. Splitting it into smaller helpers just // to placate clippy's line limit would scatter the threat model across @@ -347,6 +355,17 @@ impl BinaryCache { // oversize archive, or a misshapen signature — mirroring the // `get_verified_archive` cache-hit policy. `symlink_metadata` // refuses to chase a symlink the caller may have planted. 
+ // + // Note the intentional asymmetry with `open_regular_capped` + // (which uses `fs::metadata` and DOES follow symlinks): on the + // store path the source file is supplied by the caller (typically + // a path under our control after download), so a symlink there is + // surprising and worth rejecting. On the read path the cache dir + // is shared and an attacker may have planted a symlink — but the + // attacker already has write access, so chasing a symlink-to- + // regular is no worse than them editing the regular file + // directly, while still letting the post-open `is_file()` reject + // symlink-to-special. let archive_meta = fs::symlink_metadata(archive_path)?; if !archive_meta.file_type().is_file() { return Err(Error::Upgrade(format!( @@ -531,35 +550,13 @@ fn open_regular_capped(path: &Path, max_len: u64) -> io::Result<(File, u64)> { #[cfg(unix)] { use std::os::unix::fs::OpenOptionsExt; - // O_NONBLOCK is platform-specific: 0o4000 on Linux, 0x0004 on - // macOS/*BSD. Reads on a regular file ignore O_NONBLOCK on all - // these platforms, so this is a no-op for the happy path. - #[cfg(target_os = "linux")] - const O_NONBLOCK: i32 = 0o4000; - #[cfg(any( - target_os = "macos", - target_os = "ios", - target_os = "freebsd", - target_os = "netbsd", - target_os = "openbsd", - target_os = "dragonfly", - ))] - const O_NONBLOCK: i32 = 0x0004; - // Fallback for other unix-likes: skip the flag rather than - // guess wrong. The pre-check + post-open is_file() still gate. - #[cfg(not(any( - target_os = "linux", - target_os = "macos", - target_os = "ios", - target_os = "freebsd", - target_os = "netbsd", - target_os = "openbsd", - target_os = "dragonfly", - )))] - const O_NONBLOCK: i32 = 0; - if O_NONBLOCK != 0 { - opts.custom_flags(O_NONBLOCK); - } + // `O_NONBLOCK` is per-arch on Linux (0o4000 on x86/arm/aarch64 + // /riscv, 0o200 on mips, 0x4000 on sparc, etc.). 
Use `libc` + // so we always pick the right constant for the target arch + // instead of silently setting a different flag. Reads on a + // regular file ignore `O_NONBLOCK` on all our supported + // platforms, so this is a no-op for the happy path. + opts.custom_flags(libc::O_NONBLOCK); } opts.open(path)? }; @@ -923,22 +920,19 @@ mod tests { cache.store_archive("6.0.0", &archive, &sig).unwrap(); let cached_archive = cache.cached_archive_path("6.0.0"); fs::remove_file(&cached_archive).unwrap(); + + // Use libc::mkfifo directly so a CI image that drops coreutils + // can't silently skip this test (an earlier shell-out version + // would hide a packaging regression). The unsafe block is scoped + // to the single FFI call — `mkfifo(2)` takes a NUL-terminated + // path, returns 0 on success and -1 on error with errno set. let cstr = std::ffi::CString::new(cached_archive.as_os_str().as_encoded_bytes()).unwrap(); - // mkfifo via libc-equivalent: use the nix-free path through - // `std::process::Command` to avoid pulling a libc dep just for - // the test. `mkfifo` is in coreutils on Linux and bundled on - // macOS — both CI targets. - let mkfifo_ok = std::process::Command::new("mkfifo") - .arg(cstr.to_str().unwrap()) - .status() - .ok() - .is_some_and(|s| s.success()); - if !mkfifo_ok { - // If mkfifo isn't available skip rather than fail the suite. - eprintln!("mkfifo unavailable, skipping FIFO test"); - return; - } + #[allow(unsafe_code)] + let rc = unsafe { libc::mkfifo(cstr.as_ptr(), 0o600) }; + assert_eq!(rc, 0, "mkfifo failed: {}", std::io::Error::last_os_error()); + // Measure only the cache-hit path so cold-process startup or + // unrelated test parallelism don't blow the budget. 
let start = Instant::now(); let got = cache.get_verified_archive("6.0.0", pd.path()); let elapsed = start.elapsed(); @@ -947,9 +941,11 @@ mod tests { got.is_none(), "a FIFO planted at the cached archive path must be rejected" ); + // 5s gives generous headroom on a contended CI macOS runner + // while still catching a real "open is blocking on the FIFO". assert!( - elapsed < Duration::from_secs(2), - "open of FIFO returned in {elapsed:?}, expected ≪ 2s — \ + elapsed < Duration::from_secs(5), + "open of FIFO returned in {elapsed:?}, expected ≪ 5s — \ pre-check or O_NONBLOCK is not catching this" ); // Nothing should have been staged. From ae63d6de50fd15feb625e966b98a6799eb0f6d71 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 19 May 2026 17:59:11 +0100 Subject: [PATCH 05/23] ci: build Linux releases against musl Switch both Linux release targets from glibc to musl so the published binaries run on any Linux distribution, including Alpine and other musl-based systems. Asset filenames are unchanged (ant-node-cli-linux-{arm64,x64}.tar.gz) so existing auto-upgraders on deployed nodes continue to find them. x86_64-unknown-linux-musl now uses `cross` for the musl toolchain (matching aarch64). musl-static binaries have no dynamic linker dependency and execute on glibc hosts as well as musl hosts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/release.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3c3e8996..b49063a5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -82,13 +82,16 @@ jobs: fail-fast: false matrix: include: - # Use ubuntu-22.04 for GLIBC 2.35 compatibility with server deployments - - target: x86_64-unknown-linux-gnu + # Linux builds use musl for portability across glibc and musl distros + # (e.g. Alpine). Built via `cross` so the musl toolchain is provided + # by the cross-rs container image. 
+ - target: x86_64-unknown-linux-musl os: ubuntu-22.04 binary: ant-node archive: tar.gz + cross: true friendly_name: linux-x64 - - target: aarch64-unknown-linux-gnu + - target: aarch64-unknown-linux-musl os: ubuntu-22.04 binary: ant-node archive: tar.gz From 857969f0d960e44df737ebed8c549475175b2c8b Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 19 May 2026 18:04:17 +0100 Subject: [PATCH 06/23] feat: use mimalloc as global allocator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit musl's default malloc is notably slower than glibc's under concurrent allocation churn — the steady-state shape of a DHT-bridged P2P node. Switching the global allocator to mimalloc neutralises that regression for the musl Linux builds, and tends to outperform glibc's allocator as well, so all builds benefit. Applied to both ant-node and ant-devnet binaries. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 20 ++++++++++++++++++++ Cargo.toml | 6 ++++++ src/bin/ant-devnet/main.rs | 3 +++ src/bin/ant-node/main.rs | 3 +++ 4 files changed, 32 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 3cf0f2a3..c16e1331 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -825,7 +825,9 @@ dependencies = [ "futures", "heed", "hex", + "libc", "lru", + "mimalloc", "objc2", "objc2-foundation", "page_size", @@ -3466,6 +3468,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d1eacfa31c33ec25e873c136ba5669f00f9866d0688bea7be4d3f7e43067df6" +dependencies = [ + "cc", +] + [[package]] name = "libredox" version = "0.1.16" @@ -3584,6 +3595,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mimalloc" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b3627c4272df786b9260cabaa46aec1d59c93ede723d4c3ef646c503816b0640" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "minimal-lexical" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 002c8358..9551fc6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,12 @@ name = "ant-devnet" path = "src/bin/ant-devnet/main.rs" [dependencies] +# Global allocator. musl's default malloc is significantly slower than +# glibc's under concurrent allocation churn, which matches the node's +# steady-state workload. mimalloc neutralises that regression for the +# musl Linux builds (and tends to beat glibc's allocator too). +mimalloc = "0.1" + # Wire protocol — the single version-pin shared with ant-client. # Bumping ant-protocol's `evmlib`/`saorsa-core`/`saorsa-pqc` pins ripples # through here automatically; we keep a direct saorsa-core dep for diff --git a/src/bin/ant-devnet/main.rs b/src/bin/ant-devnet/main.rs index 1117f7de..44d85b7b 100644 --- a/src/bin/ant-devnet/main.rs +++ b/src/bin/ant-devnet/main.rs @@ -2,6 +2,9 @@ #![cfg_attr(not(feature = "logging"), allow(unused_variables))] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + mod cli; use ant_node::devnet::{Devnet, DevnetConfig, DevnetEvmInfo, DevnetManifest}; diff --git a/src/bin/ant-node/main.rs b/src/bin/ant-node/main.rs index 3af62dc9..6849103d 100644 --- a/src/bin/ant-node/main.rs +++ b/src/bin/ant-node/main.rs @@ -2,6 +2,9 @@ #![cfg_attr(not(feature = "logging"), allow(unused_variables))] +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + mod cli; mod platform; From baa7dd2fad15b000bb3162ef53b9aa9b807df7de Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Sun, 24 May 2026 17:38:38 +0100 Subject: [PATCH 07/23] chore(release): cut rc-2026.5.4 --- Cargo.lock | 56 ++++++++++++++++++++++++++---------------------------- Cargo.toml | 6 +++--- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 
3cf0f2a3..1708d67e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -809,7 +809,7 @@ dependencies = [ [[package]] name = "ant-node" -version = "0.11.4" +version = "0.11.5-rc.1" dependencies = [ "alloy", "ant-protocol", @@ -859,9 +859,8 @@ dependencies = [ [[package]] name = "ant-protocol" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d0ba3f671c08a1d52291b601ec35a7353f4f4e10f221b5f0ee372d154636dd" +version = "2.1.2-rc.1" +source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#5d80f6b0e291642305f3047522f92db3ef93d74b" dependencies = [ "blake3", "bytes", @@ -1204,9 +1203,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" [[package]] name = "aws-lc-rs" @@ -1421,9 +1420,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" [[package]] name = "byte-slice-cast" @@ -1679,9 +1678,9 @@ dependencies = [ [[package]] name = "const-hex" -version = "1.19.0" +version = "1.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20d9a563d167a9cce0f94153382b33cb6eded6dfabff03c69ad65a28ea1514e0" +checksum = "33e2a781ebdf4467d1428dc4593067825fb646f6871475098d8577421af73558" dependencies = [ "cfg-if", "cpufeatures 0.2.17", @@ -3364,9 +3363,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.98" +version = "0.3.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08" 
+checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" dependencies = [ "cfg-if", "futures-util", @@ -4846,9 +4845,8 @@ dependencies = [ [[package]] name = "saorsa-core" -version = "0.24.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c1267928da1bcc91748c314f95f7952bc01c0359a4ac70a0b111b0386898934" +version = "0.24.5-rc.1" +source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#7da46fe36b6124ac4cde434bcd18f70d4a8338ab" dependencies = [ "anyhow", "async-trait", @@ -5219,9 +5217,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "itoa", "memchr", @@ -6263,9 +6261,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.121" +version = "0.2.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790" +checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" dependencies = [ "cfg-if", "once_cell", @@ -6276,9 +6274,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.71" +version = "0.4.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +checksum = "9473dbd2991ae90b6291c3c32c30c6187ac49aa32f9905d1cce280ec1e110b0f" dependencies = [ "js-sys", "wasm-bindgen", @@ -6286,9 +6284,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.121" +version = "0.2.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578" +checksum = 
"916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6296,9 +6294,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.121" +version = "0.2.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2" +checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" dependencies = [ "bumpalo", "proc-macro2", @@ -6309,9 +6307,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.121" +version = "0.2.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441" +checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" dependencies = [ "unicode-ident", ] @@ -6366,9 +6364,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.98" +version = "0.3.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +checksum = "6d621441cfc37b84979402712047321980c178f299193a3589d05b99e8763436" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index ab3f24ac..eb469ae9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ant-node" -version = "0.11.4" +version = "0.11.5-rc.1" edition = "2021" authors = ["David Irvine "] description = "Pure quantum-proof network node for the Autonomi decentralized network" @@ -33,10 +33,10 @@ path = "src/bin/ant-devnet/main.rs" # Until then, the git pin tracks the matching saorsa-core lineage # (the rc-2026.4.2 branch) so Cargo can unify the wire types here # with ant-protocol's re-exports. 
-ant-protocol = "2.1.1" +ant-protocol = { git = "https://github.com/WithAutonomi/ant-protocol", branch = "rc-2026.5.4" } # Core (provides EVERYTHING: networking, DHT, security, trust, storage) -saorsa-core = "0.24.4" +saorsa-core = { git = "https://github.com/saorsa-labs/saorsa-core", branch = "rc-2026.5.4" } saorsa-pqc = "0.5" # Payment verification - autonomi network lookup + EVM payment From 4c6db0087b40429c8187daa5fc88bf59f18c8d1f Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Sun, 24 May 2026 18:23:25 +0100 Subject: [PATCH 08/23] fix: answer Merkle closeness check from local routing table The Merkle pay-yourself defence verified candidate closeness with an iterative Kademlia *network* lookup (find_closest_nodes_network) on the PUT-handling hot path. That lookup runs up to MAX_ITERATIONS rounds bounded by CLOSENESS_LOOKUP_TIMEOUT (240s) and is the dominant term in slow per-chunk store times; its instability (fresh transient peers pulled in on every call) also contributes to the closeness disagreements that cause outright rejections. Answer instead from the local routing table (find_closest_nodes_local, a pure in-memory k-bucket read with no network I/O), matching the precedent already used for the close-group responsibility check (find_closest_nodes_local_with_self). Fall back to the network lookup only when the local table is genuinely too sparse to be authoritative (fewer than CLOSENESS_LOOKUP_WIDTH peers near the midpoint). The fallback is gated on local table size, not match outcome, so a forged pool cannot force the expensive 240s path -- an attacker cannot make a victim's local routing table sparse. check_closeness_match and the single-flight pass-cache wrapper are unchanged. Node-side only, no wire/protocol change, so this is backwards compatible across a mixed-version fleet. 
The fallback decision is extracted into a pure const fn (closeness_should_fall_back_to_network) so its CLOSENESS_LOOKUP_WIDTH boundary is unit tested without standing up a P2PNode. Test results: - cargo fmt -- --check: clean - cargo clippy --lib --all-features -- -D clippy::panic -D clippy::unwrap_used -D clippy::expect_used: no warnings - cargo test --lib payment::verifier: 67 passed, 0 failed (incl. new boundary test closeness_falls_back_to_network_only_below_lookup_width) - e2e test target (--test e2e --features test-utils): compiles Co-Authored-By: Claude Opus 4.7 (1M context) --- src/payment/verifier.rs | 140 ++++++++++++++++++++++++++++++++-------- 1 file changed, 113 insertions(+), 27 deletions(-) diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 8f63916e..995fad34 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -990,6 +990,12 @@ impl PaymentVerifier { /// the dominant cost is still Sybil-grinding midpoint addresses or /// running real nodes near the target — same security floor. /// `CANDIDATE_CLOSENESS_REQUIRED` (13/16) is unchanged. + /// + /// Also doubles as the sparse-table gate in + /// [`verify_merkle_candidate_closeness_inner`]: the storer answers from its + /// local routing table and only falls back to the iterative network lookup + /// when the local table returns fewer than this many peers near the + /// midpoint (i.e. it genuinely cannot answer authoritatively). const CLOSENESS_LOOKUP_WIDTH: usize = 2 * evmlib::merkle_payments::CANDIDATES_PER_POOL; /// Maximum waiter → leader retries when the leader's future was cancelled @@ -1024,6 +1030,21 @@ impl PaymentVerifier { } } + /// Whether the storer must fall back from the local routing table to the + /// iterative network lookup for the Merkle closeness check. 
+ /// + /// The local k-buckets can answer authoritatively only when they hold at + /// least `CLOSENESS_LOOKUP_WIDTH` peers near the midpoint; below that the + /// table is genuinely too sparse and we pay for the network lookup. The + /// gate is local table size — NOT match outcome — so a forged pool cannot + /// force the expensive 240s path (an attacker cannot make a victim's local + /// routing table sparse). Extracted from + /// [`verify_merkle_candidate_closeness_inner`] so the boundary can be unit + /// tested without a `P2PNode`. + const fn closeness_should_fall_back_to_network(local_peer_count: usize) -> bool { + local_peer_count < Self::CLOSENESS_LOOKUP_WIDTH + } + /// Verify that the candidate pool's `pub_keys` correspond to peers that /// are actually XOR-closest to the pool midpoint address, by querying /// the DHT for its closest peers to that address and requiring that a @@ -1340,36 +1361,68 @@ impl PaymentVerifier { // the pool rather than truncating, which would otherwise re-open the // K-too-small failure mode. let lookup_count = Self::closeness_lookup_count(pool.candidate_nodes.len()); - let network_lookup = p2p_node + + // Fast path: answer from the local routing table. This is a pure + // in-memory k-bucket read (`find_closest_nodes_local` returns + // `Vec` with no network I/O and no `Result`), so it is safe to + // call from this PUT-handling request handler — unlike + // `find_closest_nodes_network`, which runs an iterative Kademlia lookup + // (up to MAX_ITERATIONS rounds, bounded by CLOSENESS_LOOKUP_TIMEOUT) and + // is the dominant term in slow per-chunk store times. The local table is + // already the view this node trusts for the close-group responsibility + // check (`find_closest_nodes_local_with_self` above), so using it here + // brings the Merkle closeness check in line with that precedent. 
+ let mut network_peers = p2p_node .dht_manager() - .find_closest_nodes_network(&pool_address.0, lookup_count); - let network_peers = - match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { - Ok(Ok(peers)) => peers, - Ok(Err(e)) => { - debug!( - "Merkle closeness network-lookup failed for pool midpoint {}: {e}", - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: could not verify candidate \ + .find_closest_nodes_local(&pool_address.0, lookup_count) + .await; + + // Sparse-table fallback: only when the local table genuinely cannot + // answer authoritatively (fewer than CLOSENESS_LOOKUP_WIDTH peers near + // the midpoint) do we pay for the iterative network lookup. The gate is + // local table size, NOT match outcome — an attacker cannot make a + // victim's local routing table sparse, so a forged pool cannot force the + // expensive 240s network path (DoS-safe). On a well-connected production + // node the local table is dense near any key, so this path is rare. + if Self::closeness_should_fall_back_to_network(network_peers.len()) { + debug!( + "Merkle closeness: local table returned only {} peers (< {}) for \ + pool midpoint {}; falling back to network lookup", + network_peers.len(), + Self::CLOSENESS_LOOKUP_WIDTH, + hex::encode(pool_address.0), + ); + let network_lookup = p2p_node + .dht_manager() + .find_closest_nodes_network(&pool_address.0, lookup_count); + network_peers = + match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { + Ok(Ok(peers)) => peers, + Ok(Err(e)) => { + debug!( + "Merkle closeness network-lookup failed for pool midpoint {}: {e}", + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: could not verify candidate \ closeness against the authoritative network view." 
- .into(), - )); - } - Err(_) => { - debug!( - "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", - Self::CLOSENESS_LOOKUP_TIMEOUT, - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: authoritative network lookup \ + .into(), + )); + } + Err(_) => { + debug!( + "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", + Self::CLOSENESS_LOOKUP_TIMEOUT, + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: authoritative network lookup \ timed out. Retry once the network lookup completes." - .into(), - )); - } - }; + .into(), + )); + } + }; + } let network_peer_ids: Vec = network_peers.iter().map(|n| n.peer_id).collect(); Self::check_closeness_match(&candidate_peer_ids, &network_peer_ids, &pool_address.0) @@ -3353,6 +3406,39 @@ mod tests { ); } + #[test] + fn closeness_falls_back_to_network_only_below_lookup_width() { + // Fix B: the storer answers the Merkle closeness check from its local + // routing table and falls back to the iterative network lookup ONLY + // when the local table is genuinely too sparse to be authoritative — + // i.e. it returns fewer than CLOSENESS_LOOKUP_WIDTH peers near the + // midpoint. The gate is local table size, not match outcome, so a + // forged pool cannot force the expensive 240s network path. + let width = PaymentVerifier::CLOSENESS_LOOKUP_WIDTH; + + // Below the boundary: local table too sparse → must fall back. + assert!( + PaymentVerifier::closeness_should_fall_back_to_network(0), + "an empty local table must fall back to the network lookup" + ); + assert!( + PaymentVerifier::closeness_should_fall_back_to_network(width - 1), + "WIDTH-1 local peers is still too sparse — must fall back" + ); + + // At/above the boundary: local table is authoritative → no fallback. + // This is the common production path: a forged pool reaching a + // well-connected node must NOT be able to trigger the network lookup. 
+ assert!( + !PaymentVerifier::closeness_should_fall_back_to_network(width), + "exactly WIDTH local peers is authoritative — must not fall back" + ); + assert!( + !PaymentVerifier::closeness_should_fall_back_to_network(width + 1), + "more than WIDTH local peers is authoritative — must not fall back" + ); + } + // Compile-time invariant: the `closeness_lookup_count` formula relies // on WIDTH being ≥ CANDIDATES_PER_POOL so we never request fewer peers // than the pool itself contains. From 246965bbcf32c2a9372772dafa3ff56f5cb9695f Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 26 May 2026 14:48:49 +0100 Subject: [PATCH 09/23] fix: verify merkle closeness against XOR-only local lookup Addresses the coordination concern raised in review (dirvine): the Merkle closeness check is a *verification* that must mirror the uploader's pure XOR-distance view, not the reachability re-rank used for storage selection. With saorsa-core's reachability-aware find_closest_nodes_local (saorsa-labs/saorsa-core#121), a re-rank could demote an XOR-close relay-only peer out of the compared window and falsely reject an honest candidate pool that legitimately contains that peer. Switch the closeness check to find_closest_nodes_local_by_distance, the XOR-only variant added to saorsa-core#121 for exactly this purpose. check_closeness_match (the set-membership helper) is unchanged. Also rename the local variable network_peers -> closeness_peers for readability (review feedback, grumbach), since it now usually holds local-table results. The rc-2026.5.4 dependency pins (saorsa-core, ant-protocol) come from the release-cut base commit; this commit only advances Cargo.lock to the rc-2026.5.4 tip so the pin includes the merged #121 (find_closest_nodes_local_by_distance), which the base's release cut predated. 
Test results (against the rc-2026.5.4 deps): - cargo fmt -- --check: clean - cargo clippy --lib --all-features (-D panic -D unwrap_used -D expect_used): no warnings - cargo test --lib payment::verifier: 67 passed, 0 failed Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 2 +- src/payment/verifier.rs | 33 +++++++++++++++++++-------------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1708d67e..929c1239 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4846,7 +4846,7 @@ dependencies = [ [[package]] name = "saorsa-core" version = "0.24.5-rc.1" -source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#7da46fe36b6124ac4cde434bcd18f70d4a8338ab" +source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#1be73520bb8939d651f6d55ef1b2308810995052" dependencies = [ "anyhow", "async-trait", diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 995fad34..660f70fd 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -1363,18 +1363,23 @@ impl PaymentVerifier { let lookup_count = Self::closeness_lookup_count(pool.candidate_nodes.len()); // Fast path: answer from the local routing table. This is a pure - // in-memory k-bucket read (`find_closest_nodes_local` returns - // `Vec` with no network I/O and no `Result`), so it is safe to - // call from this PUT-handling request handler — unlike + // in-memory k-bucket read (`find_closest_nodes_local_by_distance` + // returns `Vec` with no network I/O and no `Result`), so it is + // safe to call from this PUT-handling request handler — unlike // `find_closest_nodes_network`, which runs an iterative Kademlia lookup // (up to MAX_ITERATIONS rounds, bounded by CLOSENESS_LOOKUP_TIMEOUT) and - // is the dominant term in slow per-chunk store times. 
The local table is - // already the view this node trusts for the close-group responsibility - // check (`find_closest_nodes_local_with_self` above), so using it here - // brings the Merkle closeness check in line with that precedent. - let mut network_peers = p2p_node + // is the dominant term in slow per-chunk store times. + // + // We use the XOR-only `_by_distance` variant deliberately, NOT the + // reachability-reranked `find_closest_nodes_local`: this is a closeness + // *verification*, so it must mirror the uploader's pure XOR-distance + // view. The reachability re-rank (which the close-group *selection* path + // uses) could demote an XOR-close relay-only peer out of the compared + // window and falsely reject an honest candidate pool that legitimately + // contains that peer. See saorsa-labs/saorsa-core#121. + let mut closeness_peers = p2p_node .dht_manager() - .find_closest_nodes_local(&pool_address.0, lookup_count) + .find_closest_nodes_local_by_distance(&pool_address.0, lookup_count) .await; // Sparse-table fallback: only when the local table genuinely cannot @@ -1384,18 +1389,18 @@ impl PaymentVerifier { // victim's local routing table sparse, so a forged pool cannot force the // expensive 240s network path (DoS-safe). On a well-connected production // node the local table is dense near any key, so this path is rare. 
- if Self::closeness_should_fall_back_to_network(network_peers.len()) { + if Self::closeness_should_fall_back_to_network(closeness_peers.len()) { debug!( "Merkle closeness: local table returned only {} peers (< {}) for \ pool midpoint {}; falling back to network lookup", - network_peers.len(), + closeness_peers.len(), Self::CLOSENESS_LOOKUP_WIDTH, hex::encode(pool_address.0), ); let network_lookup = p2p_node .dht_manager() .find_closest_nodes_network(&pool_address.0, lookup_count); - network_peers = + closeness_peers = match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { Ok(Ok(peers)) => peers, Ok(Err(e)) => { @@ -1424,8 +1429,8 @@ impl PaymentVerifier { }; } - let network_peer_ids: Vec = network_peers.iter().map(|n| n.peer_id).collect(); - Self::check_closeness_match(&candidate_peer_ids, &network_peer_ids, &pool_address.0) + let closeness_peer_ids: Vec = closeness_peers.iter().map(|n| n.peer_id).collect(); + Self::check_closeness_match(&candidate_peer_ids, &closeness_peer_ids, &pool_address.0) } /// Verify a merkle batch payment proof. 
From ed939af0945c583feed5d1243781275af200dcc7 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 26 May 2026 18:01:13 +0100 Subject: [PATCH 10/23] chore(release): refresh rc-2026.5.4 lock for updated saorsa-core/ant-protocol --- Cargo.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 929c1239..fcbaccbd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -860,7 +860,7 @@ dependencies = [ [[package]] name = "ant-protocol" version = "2.1.2-rc.1" -source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#5d80f6b0e291642305f3047522f92db3ef93d74b" +source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#3d57d10ba7112169e3e02e955a4272dfcb13c550" dependencies = [ "blake3", "bytes", @@ -2910,9 +2910,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" dependencies = [ "bytes", "itoa", @@ -3508,9 +3508,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.29" +version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5" [[package]] name = "lru" @@ -4504,9 +4504,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reqwest" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0" +checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" dependencies = [ "base64", "bytes", From 8c8456f2ac61c42aec9d6e609f84397c06e76b28 Mon Sep 17 00:00:00 2001 
From: Chris O'Neil Date: Tue, 26 May 2026 21:22:50 +0100 Subject: [PATCH 11/23] fix(payment): verify closeness against pure-XOR view; escalate Merkle on mismatch Two upload-breaking regressions on testnets with a meaningful NAT fraction, both from storer-side closeness verification diverging from the uploader's network-walked peer selection. Single-node close-group check (introduced in #107): switch off the reachability-reranked find_closest_nodes_local_with_self onto the XOR-only find_closest_nodes_local_by_distance_with_self. The re-rank (saorsa-core #121) demoted XOR-close relay-only / NAT'd peers out of the local top-CLOSE_GROUP_SIZE, dropping 2-3 of the uploader's 7 quoted peers and breaching the >=5 threshold. This mirrors the fix already applied to the Merkle path; it remains a pure local lookup, so no added network cost. Merkle candidate-pool check (changed in #111): #111 moved the check off the authoritative network lookup onto the local routing table, with the fallback gated on local-table *size*, not match *outcome*. On a real network the local k-bucket sample legitimately diverges from the uploader's network-walked candidates (which include reachable responders from positions 17-32), so honest pools were hard-rejected with no escalation. Keep #111's local fast path (accept on a local match), but escalate to the authoritative network lookup on match *failure* too. Bound the reopened network-fallback path with a new closeness_fallback_permits semaphore (CLOSENESS_NETWORK_FALLBACK_CONCURRENCY = 16): inflight_closeness already collapses same-pool concurrency, and this caps the distinct-pool case so a forged-pool flood cannot spawn unbounded 240s Kademlia walks -- addressing the DoS rationale #111 gave for the size-only gate. Requires saorsa-core's find_closest_nodes_local_by_distance_with_self (saorsa-labs/saorsa-core#122) on rc-2026.5.4. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/payment/verifier.rs | 195 +++++++++++++++++++++++++++++----------- 1 file changed, 145 insertions(+), 50 deletions(-) diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 660f70fd..73bf82e8 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -163,6 +163,14 @@ pub struct PaymentVerifier { /// amplification to one lookup per unique `pool_hash` regardless of /// concurrency. inflight_closeness: Mutex>>, + /// Bounds the number of concurrent authoritative *network* closeness + /// lookups. The local routing-table fast path serves healthy traffic, so + /// honest uploads rarely fall back to the network; this caps the cost when + /// they (or a flood of forged pools that each fail the local check) do. + /// `inflight_closeness` already collapses same-pool concurrency — this + /// bounds the distinct-`pool_hash` case so a forged-pool storm cannot spawn + /// unbounded `CLOSENESS_LOOKUP_TIMEOUT`-long Kademlia walks. + closeness_fallback_permits: Arc, /// P2P node handle, attached post-construction so merkle verification can /// check that candidate `pub_keys` map to peers actually close to the pool /// midpoint in the live DHT. 
`None` in unit tests that don't exercise @@ -262,6 +270,9 @@ impl PaymentVerifier { let pool_cache = Mutex::new(LruCache::new(pool_cache_size)); let closeness_pass_cache = Mutex::new(LruCache::new(pool_cache_size)); let inflight_closeness = Mutex::new(LruCache::new(pool_cache_size)); + let closeness_fallback_permits = Arc::new(tokio::sync::Semaphore::new( + Self::CLOSENESS_NETWORK_FALLBACK_CONCURRENCY, + )); let cache_capacity = config.cache_capacity; info!("Payment verifier initialized (cache_capacity={cache_capacity}, evm=always-on, pool_cache={DEFAULT_POOL_CACHE_CAPACITY})"); @@ -283,6 +294,7 @@ impl PaymentVerifier { pool_cache, closeness_pass_cache, inflight_closeness, + closeness_fallback_permits, p2p_node: RwLock::new(None), config, } @@ -664,9 +676,20 @@ impl PaymentVerifier { &self.config.local_rewards_address, )?; + // Use the XOR-only `_by_distance` variant, NOT the reachability-reranked + // `find_closest_nodes_local_with_self`: this is a quote-closeness + // *verification*, so it must mirror the uploader's pure XOR-distance + // close-group selection (`get_store_quotes` keeps the XOR-closest + // CLOSE_GROUP_SIZE responders). The reachability re-rank that + // saorsa-labs/saorsa-core#121 added to `_with_self` demotes XOR-close + // relay-only / NAT'd peers out of the local top-CLOSE_GROUP_SIZE, so on a + // network with a meaningful NAT fraction the storer's view drops 2-3 of + // the uploader's quoted peers and falsely rejects honest single-node + // payments. This mirrors the same fix already applied to the Merkle + // closeness check below (verify_merkle_candidate_closeness_inner). let close_group = p2p_node .dht_manager() - .find_closest_nodes_local_with_self(xorname, CLOSE_GROUP_SIZE) + .find_closest_nodes_local_by_distance_with_self(xorname, CLOSE_GROUP_SIZE) .await; let close_group_peer_ids: Vec = close_group.iter().map(|node| node.peer_id).collect(); @@ -1013,6 +1036,20 @@ impl PaymentVerifier { /// at the previous value of 4). 
const MAX_LEADER_RETRIES: usize = 1; + /// Maximum number of authoritative *network* closeness lookups allowed to + /// run concurrently across all pools (see [`closeness_fallback_permits`]). + /// + /// The local routing-table fast path resolves honest traffic on a healthy, + /// converged network, so this cap is rarely approached in normal operation. + /// Its job is to bound the worst case: a flood of distinct forged pools that + /// each pass pool-signature verification but fail the local closeness check + /// would otherwise each spawn a `CLOSENESS_LOOKUP_TIMEOUT`-bounded Kademlia + /// walk. Beyond this many in-flight walks, further PUTs are rejected fast + /// with a retryable error rather than queueing. Sized generously enough to + /// absorb several legitimate concurrent batch uploads (each batch collapses + /// to a single lookup via [`inflight_closeness`]). + const CLOSENESS_NETWORK_FALLBACK_CONCURRENCY: usize = 16; + /// Compute the storer's authoritative-lookup width for a candidate pool. /// /// Returns `max(CLOSENESS_LOOKUP_WIDTH, pool_len)`: matches the client's @@ -1030,17 +1067,22 @@ impl PaymentVerifier { } } - /// Whether the storer must fall back from the local routing table to the - /// iterative network lookup for the Merkle closeness check. + /// Whether the storer's local routing table is too sparse to answer the + /// Merkle closeness check authoritatively on its own. /// /// The local k-buckets can answer authoritatively only when they hold at /// least `CLOSENESS_LOOKUP_WIDTH` peers near the midpoint; below that the - /// table is genuinely too sparse and we pay for the network lookup. The - /// gate is local table size — NOT match outcome — so a forged pool cannot - /// force the expensive 240s path (an attacker cannot make a victim's local - /// routing table sparse). Extracted from - /// [`verify_merkle_candidate_closeness_inner`] so the boundary can be unit - /// tested without a `P2PNode`. 
+ /// table is genuinely too sparse and the network lookup is mandatory. + /// + /// Note this is no longer the *only* trigger for the network lookup: a + /// dense-but-disagreeing local view also escalates to the network (see + /// [`verify_merkle_candidate_closeness_inner`]), because on a real network + /// the local k-bucket sample legitimately diverges from the uploader's + /// network-walked candidate set. The `DoS` concern that motivated the + /// size-only gate — a forged pool forcing the expensive path — is instead + /// bounded by [`closeness_fallback_permits`] + /// (`CLOSENESS_NETWORK_FALLBACK_CONCURRENCY`). Extracted so the boundary can + /// be unit tested without a `P2PNode`. const fn closeness_should_fall_back_to_network(local_peer_count: usize) -> bool { local_peer_count < Self::CLOSENESS_LOOKUP_WIDTH } @@ -1377,60 +1419,113 @@ impl PaymentVerifier { // uses) could demote an XOR-close relay-only peer out of the compared // window and falsely reject an honest candidate pool that legitimately // contains that peer. See saorsa-labs/saorsa-core#121. - let mut closeness_peers = p2p_node + let local_peers = p2p_node .dht_manager() .find_closest_nodes_local_by_distance(&pool_address.0, lookup_count) .await; - // Sparse-table fallback: only when the local table genuinely cannot - // answer authoritatively (fewer than CLOSENESS_LOOKUP_WIDTH peers near - // the midpoint) do we pay for the iterative network lookup. The gate is - // local table size, NOT match outcome — an attacker cannot make a - // victim's local routing table sparse, so a forged pool cannot force the - // expensive 240s network path (DoS-safe). On a well-connected production - // node the local table is dense near any key, so this path is rare. 
- if Self::closeness_should_fall_back_to_network(closeness_peers.len()) { + // When the local table is dense enough to be authoritative AND the pool + // already matches it, accept without paying for a network lookup — the + // common case on a healthy, converged network, and the #111 performance + // win we want to keep. + if Self::closeness_should_fall_back_to_network(local_peers.len()) { debug!( "Merkle closeness: local table returned only {} peers (< {}) for \ pool midpoint {}; falling back to network lookup", - closeness_peers.len(), + local_peers.len(), Self::CLOSENESS_LOOKUP_WIDTH, hex::encode(pool_address.0), ); - let network_lookup = p2p_node - .dht_manager() - .find_closest_nodes_network(&pool_address.0, lookup_count); - closeness_peers = - match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { - Ok(Ok(peers)) => peers, - Ok(Err(e)) => { - debug!( - "Merkle closeness network-lookup failed for pool midpoint {}: {e}", - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: could not verify candidate \ + } else { + let local_peer_ids: Vec = local_peers.iter().map(|n| n.peer_id).collect(); + if Self::check_closeness_match(&candidate_peer_ids, &local_peer_ids, &pool_address.0) + .is_ok() + { + return Ok(()); + } + // Local view disagrees. The storer's local k-bucket sample + // legitimately diverges from the uploader's *network-walked* + // candidate set: the uploader collected reachable responders (per + // CLOSENESS_LOOKUP_WIDTH, often from positions 17–32) that the + // storer's local closest-`lookup_count` need not rank the same way. + // Escalate to the authoritative network lookup before rejecting an + // honest pool — this restores the pre-#111 correctness, but only on + // the (rare on a healthy net) match-failure path rather than for + // every chunk. The DoS exposure this reopens is bounded by + // `merkle_closeness_network_peer_ids`' concurrency permit. 
+ debug!( + "Merkle closeness: local view ({} peers) disagreed for pool \ + midpoint {}; escalating to authoritative network lookup", + local_peers.len(), + hex::encode(pool_address.0), + ); + } + + // Authoritative network lookup (bounded; see + // `merkle_closeness_network_peer_ids`). This is the final verdict — + // there is no further escalation, so its result is returned directly. + let network_peer_ids = self + .merkle_closeness_network_peer_ids(&p2p_node, &pool_address.0, lookup_count) + .await?; + Self::check_closeness_match(&candidate_peer_ids, &network_peer_ids, &pool_address.0) + } + + /// Run the authoritative iterative-Kademlia closeness lookup for a pool + /// midpoint, bounded both in time (`CLOSENESS_LOOKUP_TIMEOUT`) and in + /// concurrency ([`closeness_fallback_permits`]). + /// + /// Returns the network's closest-`lookup_count` peer IDs to the midpoint. + /// All error paths are retryable (the uploader can re-PUT): the verifier is + /// at fallback capacity, the lookup failed, or it timed out. + async fn merkle_closeness_network_peer_ids( + &self, + p2p_node: &Arc, + pool_address: &[u8; 32], + lookup_count: usize, + ) -> Result> { + // Bound concurrent network walks. The local fast path handles healthy + // traffic, so honest uploads rarely reach here; a forged-pool flood + // (distinct pool_hashes that each fail the local check) could otherwise + // spawn unbounded CLOSENESS_LOOKUP_TIMEOUT-long walks. `inflight_closeness` + // already collapses same-pool concurrency; this caps the distinct-pool + // case. Excess PUTs are rejected fast (retryable) rather than queued. + let Ok(_permit) = Arc::clone(&self.closeness_fallback_permits).try_acquire_owned() else { + return Err(Error::Payment( + "Merkle candidate pool rejected: authoritative closeness lookups \ + are at capacity; retry shortly." 
+ .into(), + )); + }; + + let network_lookup = p2p_node + .dht_manager() + .find_closest_nodes_network(pool_address, lookup_count); + match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { + Ok(Ok(peers)) => Ok(peers.iter().map(|n| n.peer_id).collect()), + Ok(Err(e)) => { + debug!( + "Merkle closeness network-lookup failed for pool midpoint {}: {e}", + hex::encode(pool_address), + ); + Err(Error::Payment( + "Merkle candidate pool rejected: could not verify candidate \ closeness against the authoritative network view." - .into(), - )); - } - Err(_) => { - debug!( - "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", - Self::CLOSENESS_LOOKUP_TIMEOUT, - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: authoritative network lookup \ + .into(), + )) + } + Err(_) => { + debug!( + "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", + Self::CLOSENESS_LOOKUP_TIMEOUT, + hex::encode(pool_address), + ); + Err(Error::Payment( + "Merkle candidate pool rejected: authoritative network lookup \ timed out. Retry once the network lookup completes." - .into(), - )); - } - }; + .into(), + )) + } } - - let closeness_peer_ids: Vec = closeness_peers.iter().map(|n| n.peer_id).collect(); - Self::check_closeness_match(&candidate_peer_ids, &closeness_peer_ids, &pool_address.0) } /// Verify a merkle batch payment proof. From 40b3c47ac17d89bd9326c47902ff5063a6e3b18c Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 26 May 2026 21:49:57 +0100 Subject: [PATCH 12/23] chore: bump saorsa-core lock to rc-2026.5.4 tip (#122) Picks up find_closest_nodes_local_by_distance_with_self (saorsa-labs/saorsa-core#122, now merged to rc-2026.5.4) that the single-node close-group verification change depends on. The crate is pinned to `branch = "rc-2026.5.4"`, so this only advances Cargo.lock from 1be7352 to 82bb541; no manifest change. 
ant-node now compiles against the published branch without a local patch. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index fcbaccbd..593602bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4846,7 +4846,7 @@ dependencies = [ [[package]] name = "saorsa-core" version = "0.24.5-rc.1" -source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#1be73520bb8939d651f6d55ef1b2308810995052" +source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#82bb541237b66f4e0976ca7124f990bce2811a35" dependencies = [ "anyhow", "async-trait", From 6d54807bcaa3fe605a7fe60dae80fe69cbfba6df Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 26 May 2026 21:59:25 +0100 Subject: [PATCH 13/23] ci: run CI on rc-* release branches CI only triggered for push/pull_request against `main`, so PRs targeting release branches (e.g. rc-2026.5.4) ran no checks. Add `rc-*` to both branch filters. Note: the pull_request branch filter is evaluated against the PR's base branch, so this only starts firing for rc-targeted PRs once it has landed on the rc-2026.5.4 branch itself (i.e. after this PR merges). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f6e45043..fd06ea2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main] + branches: [main, "rc-*"] pull_request: - branches: [main] + branches: [main, "rc-*"] env: CARGO_TERM_COLOR: always From 0470e7f7b9177c553a095415d0fb2de9e738d2b1 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 26 May 2026 22:25:20 +0100 Subject: [PATCH 14/23] chore(release): roll rc-2026.5.4 to 0.11.5-rc.2 --- Cargo.lock | 22 +++++++++++----------- Cargo.toml | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4473ac0a..bb285a6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -809,7 +809,7 @@ dependencies = [ [[package]] name = "ant-node" -version = "0.11.5-rc.1" +version = "0.11.5-rc.2" dependencies = [ "alloy", "ant-protocol", @@ -861,8 +861,8 @@ dependencies = [ [[package]] name = "ant-protocol" -version = "2.1.2-rc.1" -source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#3d57d10ba7112169e3e02e955a4272dfcb13c550" +version = "2.1.2-rc.2" +source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#114e67a19abc5470a56e516b92d9796c2ab2633f" dependencies = [ "blake3", "bytes", @@ -1305,9 +1305,9 @@ checksum = "2dee39a0ee5b4095224a0cfc6bf4cc1baf0f9624b96b367e53b66d974e51d953" [[package]] name = "bitcoin_hashes" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26ec84b80c482df901772e931a9a681e26a1b9ee2302edeff23cb30328745c8b" +checksum = "4ed83caece3afc59919481b33b472e1432d1abc4641ed9100be142ef5110b406" dependencies = [ "bitcoin-io", "hex-conservative", @@ -3469,9 +3469,9 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = 
"libmimalloc-sys" -version = "0.1.47" +version = "0.1.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1eacfa31c33ec25e873c136ba5669f00f9866d0688bea7be4d3f7e43067df6" +checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9" dependencies = [ "cc", ] @@ -3596,9 +3596,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.50" +version = "0.1.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3627c4272df786b9260cabaa46aec1d59c93ede723d4c3ef646c503816b0640" +checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862" dependencies = [ "libmimalloc-sys", ] @@ -4865,8 +4865,8 @@ dependencies = [ [[package]] name = "saorsa-core" -version = "0.24.5-rc.1" -source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#82bb541237b66f4e0976ca7124f990bce2811a35" +version = "0.24.5-rc.2" +source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#3de07d588cb06d752c6c30969e97aa71df201851" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 1f5d1762..989aff3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ant-node" -version = "0.11.5-rc.1" +version = "0.11.5-rc.2" edition = "2021" authors = ["David Irvine "] description = "Pure quantum-proof network node for the Autonomi decentralized network" From 41db069eb71de520e0f8530c9ba42614ed5a09a8 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 16:04:58 +0100 Subject: [PATCH 15/23] Revert "Merge pull request #114 from WithAutonomi/fix/single-node-and-merkle-closeness-verification" This reverts commit 59da17b5027c4267a957eca3ccacda7b23b00d68, reversing changes made to ed939af0945c583feed5d1243781275af200dcc7. 
--- .github/workflows/ci.yml | 4 +- src/payment/verifier.rs | 195 ++++++++++----------------------------- 2 files changed, 52 insertions(+), 147 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd06ea2d..f6e45043 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main, "rc-*"] + branches: [main] pull_request: - branches: [main, "rc-*"] + branches: [main] env: CARGO_TERM_COLOR: always diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 73bf82e8..660f70fd 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -163,14 +163,6 @@ pub struct PaymentVerifier { /// amplification to one lookup per unique `pool_hash` regardless of /// concurrency. inflight_closeness: Mutex>>, - /// Bounds the number of concurrent authoritative *network* closeness - /// lookups. The local routing-table fast path serves healthy traffic, so - /// honest uploads rarely fall back to the network; this caps the cost when - /// they (or a flood of forged pools that each fail the local check) do. - /// `inflight_closeness` already collapses same-pool concurrency — this - /// bounds the distinct-`pool_hash` case so a forged-pool storm cannot spawn - /// unbounded `CLOSENESS_LOOKUP_TIMEOUT`-long Kademlia walks. - closeness_fallback_permits: Arc, /// P2P node handle, attached post-construction so merkle verification can /// check that candidate `pub_keys` map to peers actually close to the pool /// midpoint in the live DHT. 
`None` in unit tests that don't exercise @@ -270,9 +262,6 @@ impl PaymentVerifier { let pool_cache = Mutex::new(LruCache::new(pool_cache_size)); let closeness_pass_cache = Mutex::new(LruCache::new(pool_cache_size)); let inflight_closeness = Mutex::new(LruCache::new(pool_cache_size)); - let closeness_fallback_permits = Arc::new(tokio::sync::Semaphore::new( - Self::CLOSENESS_NETWORK_FALLBACK_CONCURRENCY, - )); let cache_capacity = config.cache_capacity; info!("Payment verifier initialized (cache_capacity={cache_capacity}, evm=always-on, pool_cache={DEFAULT_POOL_CACHE_CAPACITY})"); @@ -294,7 +283,6 @@ impl PaymentVerifier { pool_cache, closeness_pass_cache, inflight_closeness, - closeness_fallback_permits, p2p_node: RwLock::new(None), config, } @@ -676,20 +664,9 @@ impl PaymentVerifier { &self.config.local_rewards_address, )?; - // Use the XOR-only `_by_distance` variant, NOT the reachability-reranked - // `find_closest_nodes_local_with_self`: this is a quote-closeness - // *verification*, so it must mirror the uploader's pure XOR-distance - // close-group selection (`get_store_quotes` keeps the XOR-closest - // CLOSE_GROUP_SIZE responders). The reachability re-rank that - // saorsa-labs/saorsa-core#121 added to `_with_self` demotes XOR-close - // relay-only / NAT'd peers out of the local top-CLOSE_GROUP_SIZE, so on a - // network with a meaningful NAT fraction the storer's view drops 2-3 of - // the uploader's quoted peers and falsely rejects honest single-node - // payments. This mirrors the same fix already applied to the Merkle - // closeness check below (verify_merkle_candidate_closeness_inner). let close_group = p2p_node .dht_manager() - .find_closest_nodes_local_by_distance_with_self(xorname, CLOSE_GROUP_SIZE) + .find_closest_nodes_local_with_self(xorname, CLOSE_GROUP_SIZE) .await; let close_group_peer_ids: Vec = close_group.iter().map(|node| node.peer_id).collect(); @@ -1036,20 +1013,6 @@ impl PaymentVerifier { /// at the previous value of 4). 
const MAX_LEADER_RETRIES: usize = 1; - /// Maximum number of authoritative *network* closeness lookups allowed to - /// run concurrently across all pools (see [`closeness_fallback_permits`]). - /// - /// The local routing-table fast path resolves honest traffic on a healthy, - /// converged network, so this cap is rarely approached in normal operation. - /// Its job is to bound the worst case: a flood of distinct forged pools that - /// each pass pool-signature verification but fail the local closeness check - /// would otherwise each spawn a `CLOSENESS_LOOKUP_TIMEOUT`-bounded Kademlia - /// walk. Beyond this many in-flight walks, further PUTs are rejected fast - /// with a retryable error rather than queueing. Sized generously enough to - /// absorb several legitimate concurrent batch uploads (each batch collapses - /// to a single lookup via [`inflight_closeness`]). - const CLOSENESS_NETWORK_FALLBACK_CONCURRENCY: usize = 16; - /// Compute the storer's authoritative-lookup width for a candidate pool. /// /// Returns `max(CLOSENESS_LOOKUP_WIDTH, pool_len)`: matches the client's @@ -1067,22 +1030,17 @@ impl PaymentVerifier { } } - /// Whether the storer's local routing table is too sparse to answer the - /// Merkle closeness check authoritatively on its own. + /// Whether the storer must fall back from the local routing table to the + /// iterative network lookup for the Merkle closeness check. /// /// The local k-buckets can answer authoritatively only when they hold at /// least `CLOSENESS_LOOKUP_WIDTH` peers near the midpoint; below that the - /// table is genuinely too sparse and the network lookup is mandatory. - /// - /// Note this is no longer the *only* trigger for the network lookup: a - /// dense-but-disagreeing local view also escalates to the network (see - /// [`verify_merkle_candidate_closeness_inner`]), because on a real network - /// the local k-bucket sample legitimately diverges from the uploader's - /// network-walked candidate set. 
The `DoS` concern that motivated the - /// size-only gate — a forged pool forcing the expensive path — is instead - /// bounded by [`closeness_fallback_permits`] - /// (`CLOSENESS_NETWORK_FALLBACK_CONCURRENCY`). Extracted so the boundary can - /// be unit tested without a `P2PNode`. + /// table is genuinely too sparse and we pay for the network lookup. The + /// gate is local table size — NOT match outcome — so a forged pool cannot + /// force the expensive 240s path (an attacker cannot make a victim's local + /// routing table sparse). Extracted from + /// [`verify_merkle_candidate_closeness_inner`] so the boundary can be unit + /// tested without a `P2PNode`. const fn closeness_should_fall_back_to_network(local_peer_count: usize) -> bool { local_peer_count < Self::CLOSENESS_LOOKUP_WIDTH } @@ -1419,113 +1377,60 @@ impl PaymentVerifier { // uses) could demote an XOR-close relay-only peer out of the compared // window and falsely reject an honest candidate pool that legitimately // contains that peer. See saorsa-labs/saorsa-core#121. - let local_peers = p2p_node + let mut closeness_peers = p2p_node .dht_manager() .find_closest_nodes_local_by_distance(&pool_address.0, lookup_count) .await; - // When the local table is dense enough to be authoritative AND the pool - // already matches it, accept without paying for a network lookup — the - // common case on a healthy, converged network, and the #111 performance - // win we want to keep. - if Self::closeness_should_fall_back_to_network(local_peers.len()) { + // Sparse-table fallback: only when the local table genuinely cannot + // answer authoritatively (fewer than CLOSENESS_LOOKUP_WIDTH peers near + // the midpoint) do we pay for the iterative network lookup. The gate is + // local table size, NOT match outcome — an attacker cannot make a + // victim's local routing table sparse, so a forged pool cannot force the + // expensive 240s network path (DoS-safe). 
On a well-connected production + // node the local table is dense near any key, so this path is rare. + if Self::closeness_should_fall_back_to_network(closeness_peers.len()) { debug!( "Merkle closeness: local table returned only {} peers (< {}) for \ pool midpoint {}; falling back to network lookup", - local_peers.len(), + closeness_peers.len(), Self::CLOSENESS_LOOKUP_WIDTH, hex::encode(pool_address.0), ); - } else { - let local_peer_ids: Vec = local_peers.iter().map(|n| n.peer_id).collect(); - if Self::check_closeness_match(&candidate_peer_ids, &local_peer_ids, &pool_address.0) - .is_ok() - { - return Ok(()); - } - // Local view disagrees. The storer's local k-bucket sample - // legitimately diverges from the uploader's *network-walked* - // candidate set: the uploader collected reachable responders (per - // CLOSENESS_LOOKUP_WIDTH, often from positions 17–32) that the - // storer's local closest-`lookup_count` need not rank the same way. - // Escalate to the authoritative network lookup before rejecting an - // honest pool — this restores the pre-#111 correctness, but only on - // the (rare on a healthy net) match-failure path rather than for - // every chunk. The DoS exposure this reopens is bounded by - // `merkle_closeness_network_peer_ids`' concurrency permit. - debug!( - "Merkle closeness: local view ({} peers) disagreed for pool \ - midpoint {}; escalating to authoritative network lookup", - local_peers.len(), - hex::encode(pool_address.0), - ); - } - - // Authoritative network lookup (bounded; see - // `merkle_closeness_network_peer_ids`). This is the final verdict — - // there is no further escalation, so its result is returned directly. 
- let network_peer_ids = self - .merkle_closeness_network_peer_ids(&p2p_node, &pool_address.0, lookup_count) - .await?; - Self::check_closeness_match(&candidate_peer_ids, &network_peer_ids, &pool_address.0) - } - - /// Run the authoritative iterative-Kademlia closeness lookup for a pool - /// midpoint, bounded both in time (`CLOSENESS_LOOKUP_TIMEOUT`) and in - /// concurrency ([`closeness_fallback_permits`]). - /// - /// Returns the network's closest-`lookup_count` peer IDs to the midpoint. - /// All error paths are retryable (the uploader can re-PUT): the verifier is - /// at fallback capacity, the lookup failed, or it timed out. - async fn merkle_closeness_network_peer_ids( - &self, - p2p_node: &Arc, - pool_address: &[u8; 32], - lookup_count: usize, - ) -> Result> { - // Bound concurrent network walks. The local fast path handles healthy - // traffic, so honest uploads rarely reach here; a forged-pool flood - // (distinct pool_hashes that each fail the local check) could otherwise - // spawn unbounded CLOSENESS_LOOKUP_TIMEOUT-long walks. `inflight_closeness` - // already collapses same-pool concurrency; this caps the distinct-pool - // case. Excess PUTs are rejected fast (retryable) rather than queued. - let Ok(_permit) = Arc::clone(&self.closeness_fallback_permits).try_acquire_owned() else { - return Err(Error::Payment( - "Merkle candidate pool rejected: authoritative closeness lookups \ - are at capacity; retry shortly." 
- .into(), - )); - }; - - let network_lookup = p2p_node - .dht_manager() - .find_closest_nodes_network(pool_address, lookup_count); - match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { - Ok(Ok(peers)) => Ok(peers.iter().map(|n| n.peer_id).collect()), - Ok(Err(e)) => { - debug!( - "Merkle closeness network-lookup failed for pool midpoint {}: {e}", - hex::encode(pool_address), - ); - Err(Error::Payment( - "Merkle candidate pool rejected: could not verify candidate \ + let network_lookup = p2p_node + .dht_manager() + .find_closest_nodes_network(&pool_address.0, lookup_count); + closeness_peers = + match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { + Ok(Ok(peers)) => peers, + Ok(Err(e)) => { + debug!( + "Merkle closeness network-lookup failed for pool midpoint {}: {e}", + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: could not verify candidate \ closeness against the authoritative network view." - .into(), - )) - } - Err(_) => { - debug!( - "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", - Self::CLOSENESS_LOOKUP_TIMEOUT, - hex::encode(pool_address), - ); - Err(Error::Payment( - "Merkle candidate pool rejected: authoritative network lookup \ + .into(), + )); + } + Err(_) => { + debug!( + "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", + Self::CLOSENESS_LOOKUP_TIMEOUT, + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: authoritative network lookup \ timed out. Retry once the network lookup completes." - .into(), - )) - } + .into(), + )); + } + }; } + + let closeness_peer_ids: Vec = closeness_peers.iter().map(|n| n.peer_id).collect(); + Self::check_closeness_match(&candidate_peer_ids, &closeness_peer_ids, &pool_address.0) } /// Verify a merkle batch payment proof. 
From fd581843714caa071adf3c421c2f752c134e36d9 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 16:05:09 +0100 Subject: [PATCH 16/23] Revert "Merge pull request #111 from jacderida/fix/merkle-closeness-local-lookup" This reverts commit 5ac1f761aae861bbe53b0b1b803b9b88babd9d75, reversing changes made to baa7dd2fad15b000bb3162ef53b9aa9b807df7de. --- src/payment/verifier.rs | 149 ++++++++-------------------------------- 1 file changed, 29 insertions(+), 120 deletions(-) diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 660f70fd..8f63916e 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -990,12 +990,6 @@ impl PaymentVerifier { /// the dominant cost is still Sybil-grinding midpoint addresses or /// running real nodes near the target — same security floor. /// `CANDIDATE_CLOSENESS_REQUIRED` (13/16) is unchanged. - /// - /// Also doubles as the sparse-table gate in - /// [`verify_merkle_candidate_closeness_inner`]: the storer answers from its - /// local routing table and only falls back to the iterative network lookup - /// when the local table returns fewer than this many peers near the - /// midpoint (i.e. it genuinely cannot answer authoritatively). const CLOSENESS_LOOKUP_WIDTH: usize = 2 * evmlib::merkle_payments::CANDIDATES_PER_POOL; /// Maximum waiter → leader retries when the leader's future was cancelled @@ -1030,21 +1024,6 @@ impl PaymentVerifier { } } - /// Whether the storer must fall back from the local routing table to the - /// iterative network lookup for the Merkle closeness check. - /// - /// The local k-buckets can answer authoritatively only when they hold at - /// least `CLOSENESS_LOOKUP_WIDTH` peers near the midpoint; below that the - /// table is genuinely too sparse and we pay for the network lookup. The - /// gate is local table size — NOT match outcome — so a forged pool cannot - /// force the expensive 240s path (an attacker cannot make a victim's local - /// routing table sparse). 
Extracted from - /// [`verify_merkle_candidate_closeness_inner`] so the boundary can be unit - /// tested without a `P2PNode`. - const fn closeness_should_fall_back_to_network(local_peer_count: usize) -> bool { - local_peer_count < Self::CLOSENESS_LOOKUP_WIDTH - } - /// Verify that the candidate pool's `pub_keys` correspond to peers that /// are actually XOR-closest to the pool midpoint address, by querying /// the DHT for its closest peers to that address and requiring that a @@ -1361,76 +1340,39 @@ impl PaymentVerifier { // the pool rather than truncating, which would otherwise re-open the // K-too-small failure mode. let lookup_count = Self::closeness_lookup_count(pool.candidate_nodes.len()); - - // Fast path: answer from the local routing table. This is a pure - // in-memory k-bucket read (`find_closest_nodes_local_by_distance` - // returns `Vec` with no network I/O and no `Result`), so it is - // safe to call from this PUT-handling request handler — unlike - // `find_closest_nodes_network`, which runs an iterative Kademlia lookup - // (up to MAX_ITERATIONS rounds, bounded by CLOSENESS_LOOKUP_TIMEOUT) and - // is the dominant term in slow per-chunk store times. - // - // We use the XOR-only `_by_distance` variant deliberately, NOT the - // reachability-reranked `find_closest_nodes_local`: this is a closeness - // *verification*, so it must mirror the uploader's pure XOR-distance - // view. The reachability re-rank (which the close-group *selection* path - // uses) could demote an XOR-close relay-only peer out of the compared - // window and falsely reject an honest candidate pool that legitimately - // contains that peer. See saorsa-labs/saorsa-core#121. 
- let mut closeness_peers = p2p_node + let network_lookup = p2p_node .dht_manager() - .find_closest_nodes_local_by_distance(&pool_address.0, lookup_count) - .await; - - // Sparse-table fallback: only when the local table genuinely cannot - // answer authoritatively (fewer than CLOSENESS_LOOKUP_WIDTH peers near - // the midpoint) do we pay for the iterative network lookup. The gate is - // local table size, NOT match outcome — an attacker cannot make a - // victim's local routing table sparse, so a forged pool cannot force the - // expensive 240s network path (DoS-safe). On a well-connected production - // node the local table is dense near any key, so this path is rare. - if Self::closeness_should_fall_back_to_network(closeness_peers.len()) { - debug!( - "Merkle closeness: local table returned only {} peers (< {}) for \ - pool midpoint {}; falling back to network lookup", - closeness_peers.len(), - Self::CLOSENESS_LOOKUP_WIDTH, - hex::encode(pool_address.0), - ); - let network_lookup = p2p_node - .dht_manager() - .find_closest_nodes_network(&pool_address.0, lookup_count); - closeness_peers = - match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { - Ok(Ok(peers)) => peers, - Ok(Err(e)) => { - debug!( - "Merkle closeness network-lookup failed for pool midpoint {}: {e}", - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: could not verify candidate \ + .find_closest_nodes_network(&pool_address.0, lookup_count); + let network_peers = + match tokio::time::timeout(Self::CLOSENESS_LOOKUP_TIMEOUT, network_lookup).await { + Ok(Ok(peers)) => peers, + Ok(Err(e)) => { + debug!( + "Merkle closeness network-lookup failed for pool midpoint {}: {e}", + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: could not verify candidate \ closeness against the authoritative network view." 
- .into(), - )); - } - Err(_) => { - debug!( - "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", - Self::CLOSENESS_LOOKUP_TIMEOUT, - hex::encode(pool_address.0), - ); - return Err(Error::Payment( - "Merkle candidate pool rejected: authoritative network lookup \ + .into(), + )); + } + Err(_) => { + debug!( + "Merkle closeness network-lookup timeout ({:?}) for pool midpoint {}", + Self::CLOSENESS_LOOKUP_TIMEOUT, + hex::encode(pool_address.0), + ); + return Err(Error::Payment( + "Merkle candidate pool rejected: authoritative network lookup \ timed out. Retry once the network lookup completes." - .into(), - )); - } - }; - } + .into(), + )); + } + }; - let closeness_peer_ids: Vec = closeness_peers.iter().map(|n| n.peer_id).collect(); - Self::check_closeness_match(&candidate_peer_ids, &closeness_peer_ids, &pool_address.0) + let network_peer_ids: Vec = network_peers.iter().map(|n| n.peer_id).collect(); + Self::check_closeness_match(&candidate_peer_ids, &network_peer_ids, &pool_address.0) } /// Verify a merkle batch payment proof. @@ -3411,39 +3353,6 @@ mod tests { ); } - #[test] - fn closeness_falls_back_to_network_only_below_lookup_width() { - // Fix B: the storer answers the Merkle closeness check from its local - // routing table and falls back to the iterative network lookup ONLY - // when the local table is genuinely too sparse to be authoritative — - // i.e. it returns fewer than CLOSENESS_LOOKUP_WIDTH peers near the - // midpoint. The gate is local table size, not match outcome, so a - // forged pool cannot force the expensive 240s network path. - let width = PaymentVerifier::CLOSENESS_LOOKUP_WIDTH; - - // Below the boundary: local table too sparse → must fall back. 
- assert!( - PaymentVerifier::closeness_should_fall_back_to_network(0), - "an empty local table must fall back to the network lookup" - ); - assert!( - PaymentVerifier::closeness_should_fall_back_to_network(width - 1), - "WIDTH-1 local peers is still too sparse — must fall back" - ); - - // At/above the boundary: local table is authoritative → no fallback. - // This is the common production path: a forged pool reaching a - // well-connected node must NOT be able to trigger the network lookup. - assert!( - !PaymentVerifier::closeness_should_fall_back_to_network(width), - "exactly WIDTH local peers is authoritative — must not fall back" - ); - assert!( - !PaymentVerifier::closeness_should_fall_back_to_network(width + 1), - "more than WIDTH local peers is authoritative — must not fall back" - ); - } - // Compile-time invariant: the `closeness_lookup_count` formula relies // on WIDTH being ≥ CANDIDATES_PER_POOL so we never request fewer peers // than the pool itself contains. From 530b597a85fa7f3857ce3a383fdf357c6765bf2d Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 16:06:54 +0100 Subject: [PATCH 17/23] chore: drop abandoned saorsa-core/ant-protocol rc pins back to crates.io The saorsa-core and ant-protocol rc-2026.5.4 branches are being abandoned (their only changes, saorsa-core #121/#122, are reverted). Point both deps back at their crates.io releases (saorsa-core 0.24.4, ant-protocol 2.1.1) and refresh the lock. 
--- Cargo.lock | 34 ++++++++++++++++++---------------- Cargo.toml | 4 ++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb285a6a..9ba07c29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -861,8 +861,9 @@ dependencies = [ [[package]] name = "ant-protocol" -version = "2.1.2-rc.2" -source = "git+https://github.com/WithAutonomi/ant-protocol?branch=rc-2026.5.4#114e67a19abc5470a56e516b92d9796c2ab2633f" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d0ba3f671c08a1d52291b601ec35a7353f4f4e10f221b5f0ee372d154636dd" dependencies = [ "blake3", "bytes", @@ -1299,15 +1300,15 @@ dependencies = [ [[package]] name = "bitcoin-io" -version = "0.1.4" +version = "0.1.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dee39a0ee5b4095224a0cfc6bf4cc1baf0f9624b96b367e53b66d974e51d953" +checksum = "11301df0b06f22dea7bb1916403fdd88a371031e495c49b8f96931b28189e175" [[package]] name = "bitcoin_hashes" -version = "0.14.2" +version = "0.14.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ed83caece3afc59919481b33b472e1432d1abc4641ed9100be142ef5110b406" +checksum = "0c9901a56e133a1fc86eeb1113e2591f45f4682451ca893bff494d2f88918e3f" dependencies = [ "bitcoin-io", "hex-conservative", @@ -2163,9 +2164,9 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", @@ -2960,9 +2961,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +checksum = 
"eb92f162bf56536459fc83c79b974bb12837acfed43d6bc370a7916d0ae15ecc" dependencies = [ "atomic-waker", "bytes", @@ -3478,9 +3479,9 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" dependencies = [ "libc", ] @@ -3581,9 +3582,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" [[package]] name = "memoffset" @@ -4865,8 +4866,9 @@ dependencies = [ [[package]] name = "saorsa-core" -version = "0.24.5-rc.2" -source = "git+https://github.com/saorsa-labs/saorsa-core?branch=rc-2026.5.4#3de07d588cb06d752c6c30969e97aa71df201851" +version = "0.24.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c1267928da1bcc91748c314f95f7952bc01c0359a4ac70a0b111b0386898934" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 989aff3b..6d3586b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,10 +39,10 @@ mimalloc = "0.1" # Until then, the git pin tracks the matching saorsa-core lineage # (the rc-2026.4.2 branch) so Cargo can unify the wire types here # with ant-protocol's re-exports. 
-ant-protocol = { git = "https://github.com/WithAutonomi/ant-protocol", branch = "rc-2026.5.4" } +ant-protocol = "2.1.1" # Core (provides EVERYTHING: networking, DHT, security, trust, storage) -saorsa-core = { git = "https://github.com/saorsa-labs/saorsa-core", branch = "rc-2026.5.4" } +saorsa-core = "0.24.4" saorsa-pqc = "0.5" # Payment verification - autonomi network lookup + EVM payment From 510a041a93bd620f9bb40051827aed01222ea2ed Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 16:18:03 +0100 Subject: [PATCH 18/23] ci: keep CI triggers on rc-* branches The reverts of #114/#111 also dropped the rc-* branch filters from the CI workflow. Restore them so push/PR CI still runs for rc-* base branches. --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f6e45043..fd06ea2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [main] + branches: [main, "rc-*"] pull_request: - branches: [main] + branches: [main, "rc-*"] env: CARGO_TERM_COLOR: always From a3449eaa8ce0bb1baf3157596e20419feec51665 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 16:27:33 +0100 Subject: [PATCH 19/23] fix(clippy): add backticks around O_NONBLOCK in doc comment CI re-enabled on rc-* branches surfaced a pre-existing doc_markdown lint (clippy 1.95) in binary_cache.rs that fails under -D warnings. --- src/upgrade/binary_cache.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/upgrade/binary_cache.rs b/src/upgrade/binary_cache.rs index af505fdd..0fb8035d 100644 --- a/src/upgrade/binary_cache.rs +++ b/src/upgrade/binary_cache.rs @@ -901,7 +901,7 @@ mod tests { /// A cache-dir attacker who replaces the cached archive with a FIFO /// must not be able to hang `get_verified_archive` waiting for a - /// writer to connect. 
The pre-check + O_NONBLOCK belt-and-braces + /// writer to connect. The pre-check + `O_NONBLOCK` belt-and-braces /// returns immediately with an error, the cache hit is abandoned, and /// the caller falls back to a fresh verified download. #[cfg(unix)] From 634a712e826e4d0035c3412ad5139d30b8898c61 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 17:13:25 +0100 Subject: [PATCH 20/23] chore(release): roll rc-2026.5.4 to 0.11.5-rc.3 --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9ba07c29..652a6080 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -809,7 +809,7 @@ dependencies = [ [[package]] name = "ant-node" -version = "0.11.5-rc.2" +version = "0.11.5-rc.3" dependencies = [ "alloy", "ant-protocol", @@ -4157,7 +4157,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ - "toml_edit 0.25.11+spec-1.1.0", + "toml_edit 0.25.12+spec-1.1.0", ] [[package]] @@ -5929,9 +5929,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.25.11+spec-1.1.0" +version = "0.25.12+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" +checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7" dependencies = [ "indexmap 2.14.0", "toml_datetime 1.1.1+spec-1.1.0", diff --git a/Cargo.toml b/Cargo.toml index 6d3586b4..a1cd5bc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ant-node" -version = "0.11.5-rc.2" +version = "0.11.5-rc.3" edition = "2021" authors = ["David Irvine "] description = "Pure quantum-proof network node for the Autonomi decentralized network" From 89d72c33e2948ecb82008ebd763073c190e0bd1c Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 19:47:06 +0100 Subject: [PATCH 21/23] 
Revert "Merge pull request #107 from mickvandijke/fix/single-node-payment-verification" This reverts commit bece78826befa59227cbda458ca05db2336e6c9e, reversing changes made to 360c2fca3e9c53e072f967c46f38646c7b91603e. --- src/payment/verifier.rs | 724 +++++----------------------------------- 1 file changed, 88 insertions(+), 636 deletions(-) diff --git a/src/payment/verifier.rs b/src/payment/verifier.rs index 8f63916e..56328fad 100644 --- a/src/payment/verifier.rs +++ b/src/payment/verifier.rs @@ -10,7 +10,7 @@ use crate::payment::cache::{CacheStats, VerifiedCache, XorName}; use crate::payment::proof::{ deserialize_merkle_proof, deserialize_proof, detect_proof_type, ProofType, }; -use crate::payment::single_node::{QuotePaymentInfo, SingleNodePayment}; +use crate::payment::single_node::SingleNodePayment; use ant_protocol::payment::verify::{verify_quote_content, verify_quote_signature}; use evmlib::common::Amount; use evmlib::contract::payment_vault; @@ -23,7 +23,6 @@ use parking_lot::{Mutex, RwLock}; use saorsa_core::identity::node_identity::peer_id_from_public_key_bytes; use saorsa_core::identity::PeerId; use saorsa_core::P2PNode; -use std::collections::HashSet; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -53,30 +52,6 @@ const QUOTE_MAX_AGE_SECS: u64 = 86_400; /// future direction; past-dated quotes are governed by `QUOTE_MAX_AGE_SECS`. const QUOTE_FUTURE_SKEW_TOLERANCE_SECS: u64 = 300; -/// Single-node payments pay one valid quote at three times its quoted price. -const SINGLE_NODE_PRICE_MULTIPLIER: u64 = 3; - -/// Median index after sorting exactly `CLOSE_GROUP_SIZE` single-node quotes by price. -const SINGLE_NODE_MEDIAN_INDEX: usize = CLOSE_GROUP_SIZE / 2; - -/// `PaymentVaultV2.completedPayments` stores the first 16 bytes of the -/// 20-byte rewards address alongside the amount. 
-const COMPLETED_PAYMENT_REWARDS_PREFIX_LEN: usize = 16; - -/// Single-node close-group validation tolerates up to two quoted peers missing -/// from the local routing-table view to absorb normal DHT view skew. -const SINGLE_NODE_UNKNOWN_PEER_TOLERANCE: usize = 2; - -/// Minimum quoted peers that must appear in this node's local close-group view. -const SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED: usize = - CLOSE_GROUP_SIZE - SINGLE_NODE_UNKNOWN_PEER_TOLERANCE; - -const _: () = assert!( - SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED > 0 - && SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED <= CLOSE_GROUP_SIZE, - "single-node close-group match threshold must be within 1..=CLOSE_GROUP_SIZE", -); - /// Configuration for EVM payment verification. /// /// EVM verification is always on. All new data requires on-chain @@ -105,12 +80,8 @@ pub struct PaymentVerifierConfig { pub evm: EvmVerifierConfig, /// Cache capacity (number of `XorName` values to cache). pub cache_capacity: usize, - /// Local node's configured rewards address. - /// - /// For single-node proofs, the verifier requires the quote signed by this - /// node's peer identity to name this rewards address. The paid quote is - /// separately checked against the on-chain recipient prefix recorded for - /// that quote hash. + /// Local node's rewards address. + /// The verifier rejects payments that don't include this node as a recipient. pub local_rewards_address: RewardsAddress, } @@ -471,12 +442,9 @@ impl PaymentVerifier { /// 2. All quotes target the correct content address (xorname binding) /// 3. Quote timestamps are fresh (not expired or future-dated) /// 4. Peer ID bindings match the ML-DSA-65 public keys - /// 5. This node's peer ID is among the quoted peers + /// 5. This node is among the quoted recipients /// 6. All ML-DSA-65 signatures are valid (offloaded to `spawn_blocking`) - /// 7. This node's quote is bound to the configured local rewards address - /// 8. 
At least 5 of 7 quoted peers are in this node's local close-group view - /// 9. A median-priced quote from that valid quote set was paid at least - /// 3x its price on-chain to the same rewards address prefix + /// 7. The median-priced quote was paid at least 3x its price on-chain /// (looked up via `completedPayments(quoteHash)` on the payment vault) /// /// For unit tests that don't need on-chain verification, pre-populate @@ -492,7 +460,8 @@ impl PaymentVerifier { Self::validate_quote_structure(payment)?; Self::validate_quote_content(payment, xorname)?; Self::validate_quote_timestamps(payment)?; - let quoted_peer_ids = Self::validate_peer_bindings(payment)?; + Self::validate_peer_bindings(payment)?; + self.validate_local_recipient(payment)?; // Verify quote signatures (CPU-bound, run off async runtime) let peer_quotes = payment.peer_quotes.clone(); @@ -509,9 +478,6 @@ impl PaymentVerifier { .await .map_err(|e| Error::Payment(format!("Signature verification task failed: {e}")))??; - self.validate_single_node_close_group(xorname, payment, &quoted_peer_ids) - .await?; - // Reconstruct the SingleNodePayment to identify the median quote. // from_quotes() sorts by price and marks the median for 3x payment. let quotes_with_prices: Vec<_> = payment @@ -525,9 +491,17 @@ impl PaymentVerifier { )) })?; - let verified_amount = self - .verify_single_node_on_chain_payment(&single_payment) - .await?; + // Verify the median quote was paid at least 3x its price on-chain + // via completedPayments(quoteHash) on the payment vault contract.
+ let verified_amount = single_payment + .verify(&self.config.evm.network) + .await + .map_err(|e| { + let xorname_hex = hex::encode(xorname); + Error::Payment(format!( + "Median quote payment verification failed for {xorname_hex}: {e}" + )) + })?; if crate::logging::enabled!(crate::logging::Level::INFO) { let xorname_hex = hex::encode(xorname); @@ -613,8 +587,7 @@ impl PaymentVerifier { } /// Verify each quote's `pub_key` matches the claimed peer ID via BLAKE3. - fn validate_peer_bindings(payment: &ProofOfPayment) -> Result<Vec<PeerId>> { - let mut peer_ids = Vec::with_capacity(payment.peer_quotes.len()); + fn validate_peer_bindings(payment: &ProofOfPayment) -> Result<()> { for (encoded_peer_id, quote) in &payment.peer_quotes { let expected_peer_id = peer_id_from_public_key_bytes(&quote.pub_key) .map_err(|e| Error::Payment(format!("Invalid ML-DSA public key in quote: {e}")))?; @@ -627,290 +600,10 @@ impl PaymentVerifier { BLAKE3(pub_key) = {expected_hex}, peer_id = {actual_hex}" ))); } - peer_ids.push(expected_peer_id); - } - Ok(peer_ids) - } - - /// Verify enough single-node quotes came from this node's local close-group - /// view for the content address to tolerate bounded routing-table skew. - async fn validate_single_node_close_group( - &self, - xorname: &XorName, - payment: &ProofOfPayment, - quoted_peer_ids: &[PeerId], - ) -> Result<()> { - // Release the RwLock guard before awaiting the local DHT lookup. - let attached = self.p2p_node.read().as_ref().map(Arc::clone); - let Some(p2p_node) = attached else { - crate::logging::error!( - "PaymentVerifier: no P2PNode attached; rejecting single-node \ - payment. PaymentVerifier::attach_p2p_node must be called \ - before any PUT handler runs." - ); - return Err(Error::Payment( - "Single-node payment rejected: verifier is not wired to the \ - P2P layer; cannot verify quoted peer close-group membership."
- .into(), - )); - }; - - let local_peer_id = *p2p_node.peer_id(); - Self::validate_local_quoted_peer(local_peer_id, quoted_peer_ids)?; - Self::validate_local_quote_rewards_address( - local_peer_id, - quoted_peer_ids, - payment, - &self.config.local_rewards_address, - )?; - - let close_group = p2p_node - .dht_manager() - .find_closest_nodes_local_with_self(xorname, CLOSE_GROUP_SIZE) - .await; - let close_group_peer_ids: Vec = - close_group.iter().map(|node| node.peer_id).collect(); - - Self::check_single_node_close_group_match(quoted_peer_ids, &close_group_peer_ids, xorname) - } - - /// Pure set-membership check for single-node close-group validation. - fn check_single_node_close_group_match( - quoted_peer_ids: &[PeerId], - close_group_peer_ids: &[PeerId], - xorname: &XorName, - ) -> Result<()> { - let quote_count = quoted_peer_ids.len(); - if quote_count != CLOSE_GROUP_SIZE { - return Err(Error::Payment(format!( - "Single-node payment must have exactly {CLOSE_GROUP_SIZE} quoted peers, got {quote_count}" - ))); - } - - let mut quoted_set = HashSet::with_capacity(quote_count); - for peer_id in quoted_peer_ids { - if !quoted_set.insert(*peer_id) { - return Err(Error::Payment(format!( - "Single-node payment contains duplicate quoted peer {}", - peer_id.to_hex() - ))); - } - } - - let close_group_count = close_group_peer_ids.len(); - if close_group_count < SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED { - return Err(Error::Payment(format!( - "Single-node payment rejected: local close-group view for {} has only \ - {close_group_count} peer(s), need at least \ - {SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED} to verify quotes", - hex::encode(xorname), - ))); - } - - let close_group_set: HashSet = close_group_peer_ids.iter().copied().collect(); - if close_group_set.len() != close_group_peer_ids.len() { - return Err(Error::Payment( - "Single-node payment rejected: local close-group view contains duplicate peer IDs" - .into(), - )); - } - - let unknown_peer_ids: Vec = quoted_peer_ids - 
.iter() - .filter(|peer_id| !close_group_set.contains(peer_id)) - .map(PeerId::to_hex) - .collect(); - let matched_count = quote_count.saturating_sub(unknown_peer_ids.len()); - - if matched_count < SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED { - return Err(Error::Payment(format!( - "Single-node payment rejected: only {matched_count}/{CLOSE_GROUP_SIZE} quoted \ - peer(s) are present in this node's local close-group view for {}, need at \ - least {SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED}; unknown peer(s): {}", - hex::encode(xorname), - unknown_peer_ids.join(", "), - ))); - } - - Ok(()) - } - - /// Verify the local node's peer identity signed one of the quotes. - fn validate_local_quoted_peer(local_peer_id: PeerId, quoted_peer_ids: &[PeerId]) -> Result<()> { - if !quoted_peer_ids.contains(&local_peer_id) { - return Err(Error::Payment(format!( - "Payment proof does not include this node's peer ID as a quoted peer: {}", - local_peer_id.to_hex() - ))); } Ok(()) } - /// Verify this node's signed quote is bound to the configured local rewards address. 
- fn validate_local_quote_rewards_address( - local_peer_id: PeerId, - quoted_peer_ids: &[PeerId], - payment: &ProofOfPayment, - local_rewards_address: &RewardsAddress, - ) -> Result<()> { - if quoted_peer_ids.len() != payment.peer_quotes.len() { - return Err(Error::Payment( - "internal error: quoted peer IDs and payment quotes have different lengths".into(), - )); - } - - let Some((_, (_, quote))) = quoted_peer_ids - .iter() - .zip(payment.peer_quotes.iter()) - .find(|(peer_id, _)| **peer_id == local_peer_id) - else { - return Err(Error::Payment(format!( - "Payment proof does not include this node's peer ID as a quoted peer: {}", - local_peer_id.to_hex() - ))); - }; - - if quote.rewards_address != *local_rewards_address { - return Err(Error::Payment(format!( - "Payment proof includes this node's peer ID but its quote rewards address {} \ - does not match the configured local rewards address {}", - quote.rewards_address, local_rewards_address - ))); - } - - Ok(()) - } - - /// Verify a paid single-node quote against `completedPayments(quote.hash())`. 
- async fn verify_single_node_on_chain_payment( - &self, - single_payment: &SingleNodePayment, - ) -> Result { - let median = single_payment.paid_quote().ok_or_else(|| { - Error::Payment(format!( - "Missing median quote at index {}: quotes array has only {} elements", - SINGLE_NODE_MEDIAN_INDEX, - single_payment.quotes.len() - )) - })?; - let median_price = median.price; - let tied_quotes = Self::ordered_median_tied_quotes(single_payment, median); - - debug!( - "Verifying single-node quote payment: median price {median_price}, {} quote(s) tied", - tied_quotes.len() - ); - - let provider = evmlib::utils::http_provider(self.config.evm.network.rpc_url().clone()); - let vault_address = *self.config.evm.network.payment_vault_address(); - let contract = payment_vault::interface::IPaymentVault::new(vault_address, provider); - - let mut last_rejection = None; - for candidate in &tied_quotes { - let result = contract - .completedPayments(candidate.quote_hash) - .call() - .await - .map_err(|e| Error::Payment(format!("completedPayments lookup failed: {e}")))?; - let on_chain_amount = Amount::from(result.amount); - - match Self::validate_completed_single_node_payment( - candidate, - result.rewardsAddress.as_slice(), - on_chain_amount, - ) { - Ok(verified_amount) => { - debug!( - "Single-node payment verified: {verified_amount} atto paid for quote {}", - candidate.quote_hash - ); - return Ok(verified_amount); - } - Err(e) => { - last_rejection = Some(e.to_string()); - } - } - } - - let detail = last_rejection - .map(|reason| format!(" Last rejection: {reason}")) - .unwrap_or_default(); - let expected_amount = Self::expected_single_node_payment_amount(median_price)?; - Err(Error::Payment(format!( - "No median-priced quote was paid enough to the quoted rewards address: \ - expected at least {expected_amount}, checked {} tied quote(s).{detail}", - tied_quotes.len() - ))) - } - - /// Return median-price quotes with the selected median first, so the - /// normal single-node path 
needs one on-chain lookup. Other median-price - /// ties are kept as fallback for deterministic tie-order drift. - fn ordered_median_tied_quotes<'a>( - single_payment: &'a SingleNodePayment, - median: &'a QuotePaymentInfo, - ) -> Vec<&'a QuotePaymentInfo> { - let mut tied_quotes = Vec::with_capacity(CLOSE_GROUP_SIZE); - tied_quotes.push(median); - tied_quotes.extend( - single_payment - .quotes - .iter() - .filter(|quote| quote.price == median.price && !std::ptr::eq(*quote, median)), - ); - tied_quotes - } - - /// Validate the contract record for a single quote against amount and recipient. - fn validate_completed_single_node_payment( - quote: &QuotePaymentInfo, - on_chain_rewards_prefix: &[u8], - on_chain_amount: Amount, - ) -> Result<Amount> { - if quote.price == Amount::ZERO { - return Err(Error::Payment(format!( - "Median quote has zero price for quote {}; refusing to verify as paid", - quote.quote_hash - ))); - } - - let expected_amount = Self::expected_single_node_payment_amount(quote.price)?; - if on_chain_amount < expected_amount { - return Err(Error::Payment(format!( - "Underpayment for quote {}: paid {on_chain_amount}, expected at least {expected_amount}", - quote.quote_hash - ))); - } - - let expected_rewards_prefix = - Self::completed_payment_rewards_prefix(&quote.rewards_address); - if on_chain_rewards_prefix != expected_rewards_prefix { - return Err(Error::Payment(format!( - "Recipient mismatch for quote {}: completedPayments recipient prefix 0x{} \ - does not match quote rewards address {}", - quote.quote_hash, - hex::encode(on_chain_rewards_prefix), - quote.rewards_address - ))); - } - - Ok(on_chain_amount) - } - - fn expected_single_node_payment_amount(price: Amount) -> Result<Amount> { - price - .checked_mul(Amount::from(SINGLE_NODE_PRICE_MULTIPLIER)) - .ok_or_else(|| { - Error::Payment(format!( - "Price overflow when calculating {SINGLE_NODE_PRICE_MULTIPLIER}x quote price" - )) - }) - } - - fn completed_payment_rewards_prefix(rewards_address: &RewardsAddress) ->
&[u8] { - &rewards_address.as_slice()[..COMPLETED_PAYMENT_REWARDS_PREFIX_LEN] - } - /// Minimum number of candidate `pub_keys` (out of 16) whose derived `PeerId` /// must match the DHT's actual closest peers to the pool midpoint address. /// @@ -1183,7 +876,7 @@ impl PaymentVerifier { pool: &evmlib::merkle_payments::MerklePaymentCandidatePool, ) -> Result> { let mut candidate_peer_ids = Vec::with_capacity(pool.candidate_nodes.len()); - let mut seen = HashSet::with_capacity(pool.candidate_nodes.len()); + let mut seen = std::collections::HashSet::with_capacity(pool.candidate_nodes.len()); for candidate in &pool.candidate_nodes { let pid = peer_id_from_public_key_bytes(&candidate.pub_key).map_err(|e| { Error::Payment(format!( @@ -1244,7 +937,8 @@ impl PaymentVerifier { // Set-membership check against the returned closest-peers list. // Candidate `PeerId`s are deduplicated upstream, so each match // corresponds to a distinct peer. - let network_set: HashSet = network_peer_ids.iter().copied().collect(); + let network_set: std::collections::HashSet = + network_peer_ids.iter().copied().collect(); let matched = candidate_peer_ids .iter() .filter(|pid| network_set.contains(pid)) @@ -1594,13 +1288,27 @@ impl PaymentVerifier { Ok(()) } + + /// Verify this node is among the paid recipients. + fn validate_local_recipient(&self, payment: &ProofOfPayment) -> Result<()> { + let local_addr = &self.config.local_rewards_address; + let is_recipient = payment + .peer_quotes + .iter() + .any(|(_, quote)| quote.rewards_address == *local_addr); + if !is_recipient { + return Err(Error::Payment( + "Payment proof does not include this node as a recipient".to_string(), + )); + } + Ok(()) + } } #[cfg(test)] #[allow(clippy::expect_used, clippy::panic)] mod tests { use super::*; - use alloy::primitives::FixedBytes; use evmlib::merkle_payments::MerklePaymentCandidatePool; /// Create a verifier for unit tests. 
EVM is always on, but tests can @@ -2131,149 +1839,55 @@ mod tests { ); } - /// Build a deterministic `PeerId` from a single byte tag. - fn synthetic_peer_id(tag: u8) -> PeerId { - let mut bytes = [0u8; 32]; - bytes[0] = tag; - PeerId::from_bytes(bytes) - } - - /// Build a vector of synthetic `PeerId`s tagged with bytes 1..=n. - fn synthetic_peer_ids(n: u8) -> Vec { - (1..=n).map(synthetic_peer_id).collect() - } - - fn encoded_peer_id(peer_id: PeerId) -> evmlib::EncodedPeerId { - let mut bytes = [0u8; 32]; - bytes.copy_from_slice(peer_id.as_bytes()); - evmlib::EncodedPeerId::new(bytes) - } - - fn single_node_peer_ids() -> Vec { - (1..=CLOSE_GROUP_SIZE) - .map(|idx| { - let tag = u8::try_from(idx).expect("CLOSE_GROUP_SIZE fits in u8"); - synthetic_peer_id(tag) - }) - .collect() - } - - fn proof_of_payment_for_peer_ids( - xorname: XorName, - quoted_peer_ids: &[PeerId], - local_peer_id: PeerId, - local_rewards_address: RewardsAddress, - ) -> ProofOfPayment { - let peer_quotes = quoted_peer_ids - .iter() - .enumerate() - .map(|(idx, peer_id)| { - let tag = u8::try_from(idx).expect("CLOSE_GROUP_SIZE fits in u8"); - let rewards_address = if *peer_id == local_peer_id { - local_rewards_address - } else { - RewardsAddress::new([tag; 20]) - }; - ( - encoded_peer_id(*peer_id), - make_fake_quote(xorname, SystemTime::now(), rewards_address), - ) - }) - .collect(); - - ProofOfPayment { peer_quotes } + /// Helper: build an `EncodedPeerId` that matches the BLAKE3 hash of an ML-DSA public key. 
+ fn encoded_peer_id_for_pub_key(pub_key: &[u8]) -> evmlib::EncodedPeerId { + let ant_peer_id = peer_id_from_public_key_bytes(pub_key).expect("valid ML-DSA pub key"); + evmlib::EncodedPeerId::new(*ant_peer_id.as_bytes()) } - fn make_quote_payment_info(price: Amount, rewards_address: RewardsAddress) -> QuotePaymentInfo { - QuotePaymentInfo { - quote_hash: FixedBytes::from([0xABu8; 32]), - rewards_address, - amount: PaymentVerifier::expected_single_node_payment_amount(price) - .expect("expected amount"), - price, - } - } - - #[test] - fn local_quoted_peer_accepts_when_local_peer_id_was_quoted() { - let quoted_peer_ids = single_node_peer_ids(); - let local_peer_id = quoted_peer_ids[0]; - - let result = PaymentVerifier::validate_local_quoted_peer(local_peer_id, "ed_peer_ids); - - assert!( - result.is_ok(), - "local peer ID in quoted peer set must pass: {result:?}" - ); - } + #[tokio::test] + async fn test_local_not_in_paid_set_rejected() { + use evmlib::RewardsAddress; + use saorsa_core::MlDsa65; + use saorsa_pqc::pqc::MlDsaOperations; - #[test] - fn local_quoted_peer_rejects_when_local_peer_id_missing() { - let quoted_peer_ids = single_node_peer_ids(); - let local_peer_id = synthetic_peer_id(0xF0); + // Verifier with a local rewards address set + let local_addr = RewardsAddress::new([0xAAu8; 20]); + let config = PaymentVerifierConfig { + evm: EvmVerifierConfig { + network: EvmNetwork::ArbitrumOne, + }, + cache_capacity: 100, + local_rewards_address: local_addr, + }; + let verifier = PaymentVerifier::new(config); - let result = PaymentVerifier::validate_local_quoted_peer(local_peer_id, "ed_peer_ids); + let xorname = [0xEEu8; 32]; + // Quotes pay a DIFFERENT rewards address + let other_addr = RewardsAddress::new([0xBBu8; 20]); - let err = result.expect_err("local peer ID outside quoted set must be rejected"); - assert!( - err.to_string() - .contains("does not include this node's peer ID"), - "expected local peer ID rejection, got: {err}" - ); - } + // Use real ML-DSA 
keys so the pub_key→peer_id binding check passes + let ml_dsa = MlDsa65::new(); + let mut peer_quotes = Vec::new(); + for _ in 0..CLOSE_GROUP_SIZE { + let (public_key, _secret_key) = ml_dsa.generate_keypair().expect("keygen"); + let pub_key_bytes = public_key.as_bytes().to_vec(); + let encoded = encoded_peer_id_for_pub_key(&pub_key_bytes); - #[test] - fn local_quote_rewards_address_accepts_matching_config() { - let xorname = [0x44u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let local_peer_id = quoted_peer_ids[0]; - let local_rewards_address = RewardsAddress::new([0xA0; 20]); - let payment = proof_of_payment_for_peer_ids( - xorname, - "ed_peer_ids, - local_peer_id, - local_rewards_address, - ); + let mut quote = make_fake_quote(xorname, SystemTime::now(), other_addr); + quote.pub_key = pub_key_bytes; - let result = PaymentVerifier::validate_local_quote_rewards_address( - local_peer_id, - "ed_peer_ids, - &payment, - &local_rewards_address, - ); + peer_quotes.push((encoded, quote)); + } - assert!( - result.is_ok(), - "local quote rewards address must pass when it matches config: {result:?}" - ); - } + let proof_bytes = serialize_proof(peer_quotes); + let result = verifier.verify_payment(&xorname, Some(&proof_bytes)).await; - #[test] - fn local_quote_rewards_address_rejects_config_mismatch() { - let xorname = [0x45u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let local_peer_id = quoted_peer_ids[0]; - let quoted_rewards_address = RewardsAddress::new([0xA1; 20]); - let configured_rewards_address = RewardsAddress::new([0xA2; 20]); - let payment = proof_of_payment_for_peer_ids( - xorname, - "ed_peer_ids, - local_peer_id, - quoted_rewards_address, - ); - - let result = PaymentVerifier::validate_local_quote_rewards_address( - local_peer_id, - "ed_peer_ids, - &payment, - &configured_rewards_address, - ); - - let err = result.expect_err("local rewards address mismatch must be rejected"); + assert!(result.is_err(), "Should reject payment not addressed 
to us"); + let err_msg = format!("{}", result.expect_err("should fail")); assert!( - err.to_string() - .contains("does not match the configured local rewards address"), - "expected local rewards mismatch rejection, got: {err}" + err_msg.contains("does not include this node as a recipient"), + "Error should mention recipient rejection: {err_msg}" ); } @@ -2314,180 +1928,6 @@ mod tests { ); } - #[test] - fn single_node_close_group_accepts_valid_quote_set() { - let xorname = [0x11u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let close_group_peer_ids = quoted_peer_ids.clone(); - - let result = PaymentVerifier::check_single_node_close_group_match( - "ed_peer_ids, - &close_group_peer_ids, - &xorname, - ); - - assert!( - result.is_ok(), - "all quoted peers in the close-group view must pass: {result:?}" - ); - } - - #[test] - fn single_node_close_group_accepts_tolerated_unknown_peers() { - let xorname = [0x22u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let close_group_peer_ids = - quoted_peer_ids[..SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED].to_vec(); - - let result = PaymentVerifier::check_single_node_close_group_match( - "ed_peer_ids, - &close_group_peer_ids, - &xorname, - ); - - assert!( - result.is_ok(), - "up to {SINGLE_NODE_UNKNOWN_PEER_TOLERANCE} unknown quoted peers must pass: {result:?}" - ); - } - - #[test] - fn single_node_close_group_rejects_sparse_local_view() { - let xorname = [0x33u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let sparse_count = SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED - .checked_sub(1) - .expect("close-group match threshold is non-zero"); - let close_group_peer_ids = quoted_peer_ids[..sparse_count].to_vec(); - - let result = PaymentVerifier::check_single_node_close_group_match( - "ed_peer_ids, - &close_group_peer_ids, - &xorname, - ); - - let err = result.expect_err("sparse close-group view must be rejected"); - assert!( - err.to_string().contains("has only"), - "expected sparse-view rejection, got: {err}" - 
); - } - - #[test] - fn single_node_close_group_rejects_unknown_peer_tolerance_exceeded() { - let xorname = [0x34u8; 32]; - let quoted_peer_ids = single_node_peer_ids(); - let matched_count = SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED - .checked_sub(1) - .expect("close-group match threshold is non-zero"); - let mut close_group_peer_ids = quoted_peer_ids[..matched_count].to_vec(); - close_group_peer_ids.push(synthetic_peer_id(0xF0)); - - let result = PaymentVerifier::check_single_node_close_group_match( - "ed_peer_ids, - &close_group_peer_ids, - &xorname, - ); - - let err = result.expect_err("excess unknown quoted peers must be rejected"); - assert!( - err.to_string().contains(&format!( - "need at least {SINGLE_NODE_CLOSE_GROUP_MATCH_REQUIRED}" - )), - "expected close-group match threshold rejection, got: {err}" - ); - } - - #[test] - fn completed_single_node_payment_accepts_matching_recipient_and_overpayment() { - let rewards_address = RewardsAddress::new([0x44u8; 20]); - let quote = make_quote_payment_info(Amount::from(10u64), rewards_address); - let rewards_prefix = - PaymentVerifier::completed_payment_rewards_prefix("e.rewards_address).to_vec(); - let on_chain_amount = Amount::from(31u64); - - let result = PaymentVerifier::validate_completed_single_node_payment( - "e, - &rewards_prefix, - on_chain_amount, - ); - - assert_eq!(result.expect("valid completed payment"), on_chain_amount); - } - - #[test] - fn ordered_median_tied_quotes_checks_selected_median_first() { - const TIED_PRICE_ATTO: u64 = 10; - - let quotes = std::array::from_fn(|idx| { - let tag = u8::try_from(idx).expect("CLOSE_GROUP_SIZE fits in u8"); - let price = Amount::from(TIED_PRICE_ATTO); - QuotePaymentInfo { - quote_hash: FixedBytes::from([tag; 32]), - rewards_address: RewardsAddress::new([tag; 20]), - amount: PaymentVerifier::expected_single_node_payment_amount(price) - .expect("expected amount"), - price, - } - }); - let single_payment = SingleNodePayment { quotes }; - let median = 
single_payment.paid_quote().expect("median quote"); - - let ordered = PaymentVerifier::ordered_median_tied_quotes(&single_payment, median); - - assert_eq!(ordered.len(), CLOSE_GROUP_SIZE); - assert_eq!( - ordered.first().expect("first tied quote").quote_hash, - median.quote_hash - ); - let selected_median_count = ordered - .iter() - .filter(|quote| quote.quote_hash == median.quote_hash) - .count(); - assert_eq!(selected_median_count, 1); - } - - #[test] - fn completed_single_node_payment_rejects_underpayment() { - let rewards_address = RewardsAddress::new([0x55u8; 20]); - let quote = make_quote_payment_info(Amount::from(10u64), rewards_address); - let rewards_prefix = - PaymentVerifier::completed_payment_rewards_prefix("e.rewards_address).to_vec(); - let on_chain_amount = Amount::from(29u64); - - let result = PaymentVerifier::validate_completed_single_node_payment( - "e, - &rewards_prefix, - on_chain_amount, - ); - - let err = result.expect_err("underpayment must be rejected"); - assert!( - err.to_string().contains("Underpayment"), - "expected underpayment rejection, got: {err}" - ); - } - - #[test] - fn completed_single_node_payment_rejects_recipient_mismatch() { - let rewards_address = RewardsAddress::new([0x66u8; 20]); - let quote = make_quote_payment_info(Amount::from(10u64), rewards_address); - let wrong_rewards_prefix = [0x77u8; COMPLETED_PAYMENT_REWARDS_PREFIX_LEN]; - let on_chain_amount = Amount::from(30u64); - - let result = PaymentVerifier::validate_completed_single_node_payment( - "e, - &wrong_rewards_prefix, - on_chain_amount, - ); - - let err = result.expect_err("recipient mismatch must be rejected"); - assert!( - err.to_string().contains("Recipient mismatch"), - "expected recipient mismatch rejection, got: {err}" - ); - } - // ========================================================================= // Merkle-tagged proof tests // ========================================================================= @@ -3381,6 +2821,18 @@ mod tests { // can reason 
about which IDs are "in the network's top-K" vs not. // ========================================================================= + /// Build a deterministic `PeerId` from a single byte tag. + fn synthetic_peer_id(tag: u8) -> PeerId { + let mut bytes = [0u8; 32]; + bytes[0] = tag; + PeerId::from_bytes(bytes) + } + + /// Build a vector of synthetic `PeerId`s tagged with bytes 1..=n. + fn synthetic_peer_ids(n: u8) -> Vec { + (1..=n).map(synthetic_peer_id).collect() + } + #[test] fn closeness_match_passes_when_all_16_candidates_in_top_16() { // Trivial case: every candidate is in the network's top-16. From f8336fc24692c597d1de547c01a97acf336d497f Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Wed, 27 May 2026 20:02:19 +0100 Subject: [PATCH 22/23] chore(release): roll rc-2026.5.4 to 0.11.5-rc.4 --- Cargo.lock | 10 +++++----- Cargo.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 652a6080..fae10c03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -809,7 +809,7 @@ dependencies = [ [[package]] name = "ant-node" -version = "0.11.5-rc.3" +version = "0.11.5-rc.4" dependencies = [ "alloy", "ant-protocol", @@ -7105,18 +7105,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.48" +version = "0.8.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +checksum = "bce33a6288fa3f072a8c2c7d0f2fdbb90e28298f0135c1f99b96c3db2efcc60b" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.48" +version = "0.8.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +checksum = "8fd425244944f4ab65ccff928e7323354c5a018c75838362fdce749dfad2ee1e" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index a1cd5bc2..bce316f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ 
[package] name = "ant-node" -version = "0.11.5-rc.3" +version = "0.11.5-rc.4" edition = "2021" authors = ["David Irvine "] description = "Pure quantum-proof network node for the Autonomi decentralized network" From 88d47e8dafb6c1a0aabf491fc5b27f28255594c8 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Thu, 28 May 2026 12:55:16 +0100 Subject: [PATCH 23/23] chore(release): promote rc-2026.5.4 to 0.11.5 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fae10c03..13b5520b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -809,7 +809,7 @@ dependencies = [ [[package]] name = "ant-node" -version = "0.11.5-rc.4" +version = "0.11.5" dependencies = [ "alloy", "ant-protocol", diff --git a/Cargo.toml b/Cargo.toml index bce316f0..0ef01ea7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ant-node" -version = "0.11.5-rc.4" +version = "0.11.5" edition = "2021" authors = ["David Irvine "] description = "Pure quantum-proof network node for the Autonomi decentralized network"