From 784315a5acb4b5d8c2f2e0a66c917cbaf8c8d320 Mon Sep 17 00:00:00 2001 From: Mark Hildebrand Date: Sat, 7 Feb 2026 13:55:42 -0800 Subject: [PATCH 1/4] Add performance diagnostic to `diskann-benchmark` --- diskann-benchmark/src/main.rs | 189 +++++++++++++++++++------ diskann-wide/src/arch/emulated/mod.rs | 4 + diskann-wide/src/arch/mod.rs | 78 +++++++++- diskann-wide/src/arch/x86_64/mod.rs | 37 +++++ diskann-wide/src/arch/x86_64/v3/mod.rs | 6 +- diskann-wide/src/arch/x86_64/v4/mod.rs | 6 +- diskann-wide/src/helpers.rs | 5 - 7 files changed, 274 insertions(+), 51 deletions(-) diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index 53c7d9b55..d52b5be8f 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -7,26 +7,116 @@ mod backend; mod inputs; mod utils; +use diskann_benchmark_runner as runner; + fn main() -> Result<(), anyhow::Error> { - run_program( - &diskann_benchmark_runner::App::parse(), - &mut diskann_benchmark_runner::output::default(), - ) + let cli = Cli::parse(); + let mut output = runner::output::default(); + cli.run(&mut output) +} + +/// The top-level CLI for the benchmark binary. +/// +/// We have some additional arguments on top of [`runner::App`] for performance warnings. +#[derive(Debug, clap::Parser)] +struct Cli { + /// Suppress compilation target related performance warnings. + #[arg(long, action)] + quiet: bool, + + #[command(flatten)] + app: runner::App, } -fn run_program( - app: &diskann_benchmark_runner::App, - output: &mut dyn diskann_benchmark_runner::Output, -) -> anyhow::Result<()> { - // Collect inputs. - let mut inputs = diskann_benchmark_runner::registry::Inputs::new(); - inputs::register_inputs(&mut inputs)?; +// This controls printing of a banner warning if the benchmark tool is compiled for the +// `x86-64` target CPU instead of `x86-64-v3`. 
The former will likely lead to misleading +// performance, but is Rust's default when building for `x86-64` and can thus be a common +// source of performance confusion. +// +// The diagnostic can be suppressed by passing the `--quiet` flag. +impl Cli { + fn parse() -> Self { + <Self as clap::Parser>::parse() + } + + fn run(&self, output: &mut dyn runner::Output) -> anyhow::Result<()> { + self.check_target(output)?; + + // Collect inputs. + let mut inputs = runner::registry::Inputs::new(); + inputs::register_inputs(&mut inputs)?; + + // Collect benchmarks. + let mut benchmarks = runner::registry::Benchmarks::new(); + backend::register_benchmarks(&mut benchmarks); + + self.app.run(&inputs, &benchmarks, output) + } + + #[cfg(test)] + fn from_commands(commands: runner::app::Commands, quiet: bool) -> Self { + Self { + quiet, + app: runner::App::from_commands(commands), + } + } + + #[cfg(target_arch = "x86_64")] + fn check_target(&self, mut output: &mut dyn runner::Output) -> anyhow::Result<()> { + use std::io::Write; + use diskann_wide::Architecture; + + // The trick we use here is to inspect the compile-time architecture of `diskann-wide`. + // + // If the `x86_64::V3` architecture is reachable from `diskann_wide::ARCH`, then we know + // that most of the optimizations we care about should be present. + if !self.quiet + && diskann_wide::arch::Current::level() < diskann_wide::arch::x86_64::V3::level() + { + let message = r#" +WARNING + +> This application was compiled for the `x86-64` target CPU. +> It is recommended to set the target CPU to at least +> `x86-64-v3` for best performance. +> +> This can be done by using the environment variable +> RUSTFLAGS="-Ctarget-cpu=x86-64-v3" +> before compiling this binary with Cargo. +> +> This warning can be suppressed by passing the `--quiet` flag +> before any of the documented commands. +"#; + writeln!(output, "{}", message)?; + } + + Ok(()) + } - // Collect benchmarks. 
- let mut benchmarks = diskann_benchmark_runner::registry::Benchmarks::new(); - backend::register_benchmarks(&mut benchmarks); + #[cfg(target_arch = "aarch64")] + fn check_target(&self, mut output: &mut dyn runner::Output) -> anyhow::Result<()> { + use std::io::Write; + if !self.quiet { + let message = r#" +WARNING + +> Support for AArch64 has not yet been optimized. +> +> Performance may not be representative. +> +> This warning can be suppressed by passing the `--quiet` flag +> before any of the documented commands. +"#; + writeln!(output, "{}", message)?; + } - app.run(&inputs, &benchmarks, output) + Ok(()) + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + fn check_target(&self, mut _output: &mut dyn runner::Output) -> anyhow::Result<()> { + Ok(()) + } } /////////// @@ -40,7 +130,7 @@ mod tests { use super::*; - use diskann_benchmark_runner::{app::Commands, output::Memory, App}; + use diskann_benchmark_runner::{app::Commands, output::Memory}; use diskann_providers::storage::FileStorageProvider; use diskann_tools::utils::{compute_ground_truth_from_datafiles, GraphDataF32Vector}; use diskann_vector::distance::Metric; @@ -190,10 +280,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -234,10 +324,10 @@ mod tests { dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -254,10 +344,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = 
Cli::from_commands(command, true); let mut output = Memory::new(); - let err = run_program(&app, &mut output).unwrap_err(); + let err = cli.run(&mut output).unwrap_err(); println!("err = {:?}", err); let output = String::from_utf8(output.into_inner()).unwrap(); @@ -299,10 +389,10 @@ mod tests { dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -319,10 +409,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - let err = run_program(&app, &mut output).unwrap_err(); + let err = cli.run(&mut output).unwrap_err(); println!("err = {:?}", err); let output = String::from_utf8(output.into_inner()).unwrap(); @@ -367,10 +457,10 @@ mod tests { dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -387,10 +477,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - let err = run_program(&app, &mut output).unwrap_err(); + let err = cli.run(&mut output).unwrap_err(); println!("err = {:?}", err); let output = String::from_utf8(output.into_inner()).unwrap(); @@ -426,10 +516,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut 
output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -480,10 +570,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -563,10 +653,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -637,10 +727,10 @@ mod tests { dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - run_program(&app, &mut output).unwrap(); + cli.run(&mut output).unwrap(); println!( "output = {}", String::from_utf8(output.into_inner()).unwrap() @@ -657,10 +747,10 @@ mod tests { output_file: output_path.to_owned(), dry_run: false, }; - let app = App::from_commands(command); + let cli = Cli::from_commands(command, true); let mut output = Memory::new(); - let err = run_program(&app, &mut output).unwrap_err(); + let err = cli.run(&mut output).unwrap_err(); println!("err = {:?}", err); let output = String::from_utf8(output.into_inner()).unwrap(); @@ -670,4 +760,23 @@ mod tests { // The output file should not have been created because we failed the test. 
assert!(!output_path.exists()); } + + #[test] + fn quiet_suppresses_check_target_warning() { + let cli = Cli::from_commands(Commands::Skeleton, true); + let mut output = Memory::new(); + cli.check_target(&mut output).unwrap(); + assert!(output.into_inner().is_empty()); + } + + // Smoke test: `check_target` should succeed regardless of the `--quiet` flag or the + // compile-time architecture level. We intentionally do not assert on the output content + // because whether a warning is emitted depends on the target CPU the tests were compiled + // for. + #[test] + fn check_target_smoke_test() { + let cli = Cli::from_commands(Commands::Skeleton, false); + let mut output = Memory::new(); + cli.check_target(&mut output).unwrap(); + } } diff --git a/diskann-wide/src/arch/emulated/mod.rs b/diskann-wide/src/arch/emulated/mod.rs index cd2a77a69..283c9a720 100644 --- a/diskann-wide/src/arch/emulated/mod.rs +++ b/diskann-wide/src/arch/emulated/mod.rs @@ -60,6 +60,10 @@ impl Architecture for Scalar { arch::maskdef!(); arch::typedef!(); + fn level() -> arch::Level { + arch::Level::scalar() + } + fn run(self, f: F) -> R where F: Target, diff --git a/diskann-wide/src/arch/mod.rs b/diskann-wide/src/arch/mod.rs index d0e595852..d2c13d621 100644 --- a/diskann-wide/src/arch/mod.rs +++ b/diskann-wide/src/arch/mod.rs @@ -321,12 +321,16 @@ //! //! ## Hierarchies //! +//! Each [`Architecture`] exposes a [`Level`] via [`Architecture::level()`] that +//! can be used to compare capabilities without instantiating the architecture. +//! //! ### X86 //! -//! * [`crate::arch::x86_64::V3`]: Supporting AVX2 and lower. -//! * [`Scalar`]: Fallback architecture +//! * [`x86_64::V4`]: Supporting AVX-512 (and AVX2 and lower). +//! * [`x86_64::V3`]: Supporting AVX2 and lower. +//! * [`Scalar`]: Fallback architecture. //! -//! Upcoming will be `V4`, which will include support for AVX-512. +//! The ordering is `Scalar` < `V3` < `V4`. //! //! ### Arm //! 
@@ -345,10 +349,43 @@ pub(crate) mod emulated; /// compiler for optimization. pub use emulated::Scalar; +/// An opaque representation of an [`Architecture`]'s capability level. +/// +/// `Level` allows comparing the relative capabilities of different architectures +/// without requiring an instance of the architecture type. This is useful for +/// compile-time checks against [`crate::ARCH`] where constructing architecture +/// types like [`x86_64::V3`] would require `unsafe`. +/// +/// Levels are totally ordered within an ISA family, with greater values indicating +/// more capable instruction sets. [`Scalar`] is always the lowest level. +/// +/// # Examples +/// +/// Checking if the compile-time architecture meets a minimum capability: +/// +/// ``` +/// use diskann_wide::{Architecture, arch}; +/// +/// // Check at compile time whether we were built with AVX2+ support. +/// #[cfg(target_arch = "x86_64")] +/// let meets_v3 = arch::Current::level() >= arch::x86_64::V3::level(); +/// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct Level(LevelInner); + +impl Level { + const fn scalar() -> Self { + Self(LevelInner::Scalar) + } +} + cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86_64", doc))] { // Delegate to the architecture selection within the `x86_64` module,. pub mod x86_64; + + use x86_64::LevelInner; + pub use x86_64::current; pub use x86_64::Current; @@ -361,9 +398,25 @@ cfg_if::cfg_if! { pub use x86_64::dispatch1_no_features; pub use x86_64::dispatch2_no_features; pub use x86_64::dispatch3_no_features; + + impl Level { + const fn v3() -> Self { + Self(LevelInner::V3) + } + + const fn v4() -> Self { + Self(LevelInner::V4) + } + } } else { pub type Current = Scalar; + // There is only one architecture present in this mode. 
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + enum LevelInner { + Scalar, + } + pub const fn current() -> Current { Scalar::new() } @@ -616,6 +669,23 @@ pub trait Architecture: sealed::Sealed { // Methods // //---------// + /// Return an opaque [`Level`] representing the capabilities of this architecture. + /// + /// Levels that compare greater represent architectures that are more capable. + /// + /// # Examples + /// + /// ``` + /// use diskann_wide::{Architecture, arch}; + /// + /// // Scalar is the baseline — every other architecture compares greater. + /// assert_eq!(arch::Scalar::level(), arch::Scalar::level()); + /// + /// #[cfg(target_arch = "x86_64")] + /// assert!(arch::Scalar::level() < arch::x86_64::V3::level()); + /// ``` + fn level() -> Level; + /// Run the provided closure targeting this architecture. /// /// This function is always safe to call, but the function `f` likely needs to be @@ -907,7 +977,7 @@ where } /// A hidden architecture for use in the function pointer API. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] struct Hidden; const _ASSERT_ZST: () = assert!( diff --git a/diskann-wide/src/arch/x86_64/mod.rs b/diskann-wide/src/arch/x86_64/mod.rs index 42d8d0abd..c0664293e 100644 --- a/diskann-wide/src/arch/x86_64/mod.rs +++ b/diskann-wide/src/arch/x86_64/mod.rs @@ -18,6 +18,14 @@ pub mod v4; pub use v3::V3; pub use v4::V4; +// The ordering is `Scalar < V3 < V4`. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub(super) enum LevelInner { + Scalar, + V3, + V4, +} + //////////////////////////// // Architecture Selection // //////////////////////////// @@ -527,4 +535,33 @@ mod tests { assert_eq!(y, "foo"); } } + + #[test] + fn test_level_ordering() { + use crate::Architecture; + + let scalar = Scalar::level(); + let v3 = V3::level(); + let v4 = V4::level(); + + // Scalar < V3 < V4 + assert!(scalar < v3); + assert!(scalar < v4); + assert!(v3 < v4); + + // Reverse + assert!(v4 > v3); + assert!(v4 > scalar); + assert!(v3 > scalar); + + // Equality + assert_eq!(scalar, Scalar::level()); + assert_eq!(v3, V3::level()); + assert_eq!(v4, V4::level()); + + // Not equal across levels + assert_ne!(scalar, v3); + assert_ne!(scalar, v4); + assert_ne!(v3, v4); + } } diff --git a/diskann-wide/src/arch/x86_64/v3/mod.rs b/diskann-wide/src/arch/x86_64/v3/mod.rs index 1f99b97ff..29d1d1d8a 100644 --- a/diskann-wide/src/arch/x86_64/v3/mod.rs +++ b/diskann-wide/src/arch/x86_64/v3/mod.rs @@ -96,7 +96,7 @@ mod conversion; /// An [`Architecture`] supporting all the requirements of the /// [`x86-64-v3`](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels). 
-#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct V3(Hidden); impl arch::sealed::Sealed for V3 {} @@ -343,6 +343,10 @@ impl Architecture for V3 { arch::maskdef!(); arch::typedef!(); + fn level() -> arch::Level { + arch::Level::v3() + } + #[inline(always)] fn run(self, f: F) -> R where diff --git a/diskann-wide/src/arch/x86_64/v4/mod.rs b/diskann-wide/src/arch/x86_64/v4/mod.rs index 0e917425c..baa76f29b 100644 --- a/diskann-wide/src/arch/x86_64/v4/mod.rs +++ b/diskann-wide/src/arch/x86_64/v4/mod.rs @@ -181,7 +181,7 @@ mod conversion; /// * `avx512bitalg` (upcoming - pending CI resources) /// * `avx512vpopcntdq` (upcoming - pending CI resources) /// -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct V4(Hidden); impl arch::Sealed for V4 {} @@ -444,6 +444,10 @@ impl Architecture for V4 { arch::maskdef!(); arch::typedef!(); + fn level() -> arch::Level { + arch::Level::v4() + } + #[inline(always)] fn run(self, f: F) -> R where diff --git a/diskann-wide/src/helpers.rs b/diskann-wide/src/helpers.rs index 19adeddcc..da1ef7ba4 100644 --- a/diskann-wide/src/helpers.rs +++ b/diskann-wide/src/helpers.rs @@ -3,11 +3,6 @@ * Licensed under the MIT license. */ -// #![cfg_attr( -// not(all(target_arch = "x86_64", target_feature = "avx2")), -// allow(unused_macros, unused_imports) -// )] - /// Utility macro for defining simple operations that lower to a single intrinsic. 
/// /// SAFETY: It is the invoker's responsibility to ensure that the intrinsic is safe to call From 370c6de17da7b31248d528517e7e16e73ad43780 Mon Sep 17 00:00:00 2001 From: Mark Hildebrand Date: Sat, 7 Feb 2026 14:16:44 -0800 Subject: [PATCH 2/4] Update diskann-wide/src/arch/mod.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- diskann-wide/src/arch/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-wide/src/arch/mod.rs b/diskann-wide/src/arch/mod.rs index d2c13d621..0147b864d 100644 --- a/diskann-wide/src/arch/mod.rs +++ b/diskann-wide/src/arch/mod.rs @@ -381,7 +381,7 @@ impl Level { cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86_64", doc))] { - // Delegate to the architecture selection within the `x86_64` module,. + // Delegate to the architecture selection within the `x86_64` module. pub mod x86_64; use x86_64::LevelInner; From 8412ee21b1a044e9be7cc7c32ba20d8840777ce4 Mon Sep 17 00:00:00 2001 From: Mark Hildebrand Date: Sat, 7 Feb 2026 14:16:18 -0800 Subject: [PATCH 3/4] Run formatter. --- diskann-benchmark/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index d52b5be8f..e999bf10e 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -63,8 +63,8 @@ impl Cli { #[cfg(target_arch = "x86_64")] fn check_target(&self, mut output: &mut dyn runner::Output) -> anyhow::Result<()> { - use std::io::Write; use diskann_wide::Architecture; + use std::io::Write; // The trick we use here is to inspect the compile-time architecture of `diskann-wide`. // From e835ba00078867e03dfffd911e599a0278f6c286 Mon Sep 17 00:00:00 2001 From: Mark Hildebrand Date: Sat, 7 Feb 2026 14:20:19 -0800 Subject: [PATCH 4/4] Minor docs cleanup. 
--- diskann-benchmark/src/main.rs | 2 +- diskann-wide/src/arch/mod.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index e999bf10e..b3de5901e 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -76,7 +76,7 @@ impl Cli { let message = r#" WARNING -> This application was compiled for the `x86-64` target CPU. +> This application was compiled without AVX2 support. > It is recommended to set the target CPU to at least > `x86-64-v3` for best performance. > diff --git a/diskann-wide/src/arch/mod.rs b/diskann-wide/src/arch/mod.rs index 0147b864d..3cf07654f 100644 --- a/diskann-wide/src/arch/mod.rs +++ b/diskann-wide/src/arch/mod.rs @@ -364,11 +364,12 @@ pub use emulated::Scalar; /// Checking if the compile-time architecture meets a minimum capability: /// /// ``` +/// #[cfg(target_arch = "x86_64")] /// use diskann_wide::{Architecture, arch}; /// /// // Check at compile time whether we were built with AVX2+ support. /// #[cfg(target_arch = "x86_64")] -/// let meets_v3 = arch::Current::level() >= arch::x86_64::V3::level(); +/// let _meets_v3 = arch::Current::level() >= arch::x86_64::V3::level(); /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Level(LevelInner);